this adds in date math to change from two months to two weeks for the feed length

minor corrections
updated blank with changes from other file
2024-03-17 19:07:28 -04:00 · 2024-03-07 14:28:03 -05:00 · 2024-03-07 14:22:17 -05:00 · 2024-03-07 14:15:03 -05:00 · 2024-03-07 14:14:46 -05:00 · 2024-03-06 19:53:25 -05:00
3 changed files with 93 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -10,8 +10,8 @@ The following CPAN modules need to be installed on your server:

 * LWP::Simple
 * LWP::Protocol::https
-* XML::RSS; 
-* HTML::Entities;
+* XML::RSS
+* HTML::Entities

 # Configuration

@ -28,10 +28,17 @@ This area is for general information on the site and feed.
 * feed URL
 * copyright 

-There are two files you have to save, update these with the path of the folders on your server where they will be saved. Examples are given for CPanel type servers:
+There are three files you have to save; update these with the paths of the folders on your server where they will be saved.
+
+* Feed - the RSS feed generated for the site, should be an xml file
+* HTML - the site itself, should be an html file
+* Error Log - lists the feed URLs that could not be parsed, should be a txt file
+
+Example paths are given for CPanel type servers:

 * Feed - /home/USER_NAME/public_html/feed.xml 
-* HTML - /home/USER_NAME/public_html/index.xml
+* HTML - /home/USER_NAME/public_html/index.html
+* Error Log - /home/USER_NAME/public_html/feed.log

 ## The Site

--- a/blankRSS.pl
+++ b/blankRSS.pl
@ -10,7 +10,7 @@ use warnings;
 #  blankRSS.pl
 # 
 # This script pulls from a list of RSS feeds and agregates them together into a web page. 
-# It is designed to run as a cron and overright the HTML file.
+# It is designed to run as a cron and overwrite the HTML file.
 # 
 # license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
 # Code repository: https://code.jacobhaddon.com/jake/smhn
@ -28,10 +28,12 @@ use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
 # server file folders
 # my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
 # my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
+# my $errorFilePath = "/home/USER_FOLDER/public_html/feed.log";

 # local file folders
 my $rssFilePath = "feed.xml";
 my $htmlFilePath = "index.html";
+my $errorFilePath = "feed.log";

 ###################################
 #  RSS Configurations 
@ -63,6 +65,10 @@ my %list;
 # Make the list of URLS while parsing DATA
 my $listHTML = "<div class=\'listHTML\'><ul>\n";

+# Make a list of URLs that have an error
+my $listURLError = "The following feeds had issues this time:\n\n" . $now->strftime('%a, %d %b %Y %H:%M:%S %z'). "\n\n";
+
+# Go through each URL in the DATA section and make the new list
 while ( my $url = <DATA>) {
    chomp $url;
    
@ -75,7 +81,12 @@ while ( my $url = <DATA>) {
 	# parse the XML
    my $rss1 = XML::RSS->new;
    eval { $rss1->parse( $xml ) };
-    next if $@;
+    
+    # if parsing failed, add URL to log file variable
+ 	if ($@) {
+    	$listURLError .= "* " . $url . "\n";
+    	next; 
+    }; # if $@
    
    # go through the items from the XML
    for (my $j = 0; $j <= $number_of_items; $j++){
@ -96,14 +107,29 @@ while ( my $url = <DATA>) {
 			$testItem->{'feedURL'} = $_;
 			$testItem->{'feedName'} = $rss1->{'channel'}{'title'};
 			
+			# Find Author tags 
+			if ($testItem->{'dc'}{'creator'}) {
+				$testItem->{'itemAuthor'} = $testItem->{'dc'}{'creator'};
+			} elsif ($testItem->{'author'}) {
+				$testItem->{'itemAuthor'} = $testItem->{'author'}
+			} else {
+				$testItem->{'itemAuthor'} = $rss1->{'channel'}{'title'};
+			} # if author  
+						
 			# Clean up some of the artifacts in the RSS feed 'description' section
 			$testItem->{'description'} =~ s/\n\s*/\n/g; # get rid of excess white space
 			$testItem->{'description'} =~ s/(<a(.+?)<\/a>)$//s; # link at end of description
 			$testItem->{'description'} =~ s/<(\w) class=(.*?)>/<$1>/s; # remove class statements from text
 			$testItem->{'description'} =~ s/<figure ((.|\n)+?)figure>//sg; #remove "figure" can use pipe to add more
 			$testItem->{'description'} =~ s/<img(.+?)>//sg; # remove IMG tags
+			$testItem->{'description'} =~ s/<span ((.|\n)+?)>//sg; #remove "span" tags (mostly blogger)
+			$testItem->{'description'} =~ s/<\/span>//sg; #remove "span" endtags 
+			$testItem->{'description'} =~ s/<div class="separator" style(.+?)<\/div>//sg; # remove blogger DIV tags
+			$testItem->{'description'} =~ s/<br(.+?)>/<br>/sg; # remove blogger BR tags
+			$testItem->{'description'} =~ s/(<div><br><\/div>)+/<br>/sg; # remove blogger BR + DIV tags
 			$testItem->{'description'} =~ s/.{1200}\K.*//s; # limit length 
-			$testItem->{'description'} =~ s/(<[^<]+)$//s; # link at end of description
+			$testItem->{'description'} =~ s/<\/?div.*?>//sg; # remove div tags
+			$testItem->{'description'} =~ s/(<a[^<]+)$//s; # link at end of description
 			
 			#add continue reading to end. 
 			$testItem->{'description'} .= " <a href=\'" . $testItem->{'link'} . "\' target=\'_blank\'>Continue Reading</a>"; 
@ -122,6 +148,14 @@ while ( my $url = <DATA>) {
 #close out the list URL html
 $listHTML .= "</div></ul>\n";

+###################################
+#  Write the error file 
+###################################
+
+open(FH, '>', $errorFilePath) or die $!;
+print FH $listURLError;
+close(FH);
+
 ###################################
 #  Make an RSS Feed! 
 ###################################
--- a/smhnRSS.pl
+++ b/smhnRSS.pl
@ -10,7 +10,7 @@ use warnings;
 # The Sunday Morning Horror News
 # 
 # This script pulls from a list of RSS feeds and agregates them together into a web page. 
-# It is designed to run as a cron and overright the HTML file.
+# It is designed to run as a cron and overwrite the HTML file.
 # 
 # license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
 # Code repository: https://code.jacobhaddon.com/jake/smhn
@ -21,6 +21,7 @@ use warnings;
 # Packages 

 use Time::Piece; # https://perldoc.perl.org/Time::Piece
+use Time::Seconds; # https://perldoc.perl.org/Time::Seconds
 use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
 use XML::RSS; # https://metacpan.org/pod/XML::RSS
 use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
@ -28,10 +29,12 @@ use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
 # server file folders
 # my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
 # my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
+# my $errorFilePath = "/home/USER_FOLDER/public_html/feed.log";

 # local file folders
 my $rssFilePath = "feed.xml";
 my $htmlFilePath = "index.html";
+my $errorFilePath = "feed.log";

 ###################################
 #  RSS Configurations 
@ -50,9 +53,12 @@ my $copyright = 'Copyright respective writers';
 # add to new RSS feed object
 ###################################

+# number of weeks in the past to hold RSS feed
+my $num_weeks = 2;
+
 # get today, subtact time to make cut off
 my $now = localtime;
-my $then = $now->add_months(-2);
+my $then = $now - (ONE_WEEK * $num_weeks); 

 #number of items to keep from each feed
 my $number_of_items = 2; # +1 since everything starts at 0
@ -63,6 +69,9 @@ my %list;
 # Make the list of URLS while parsing DATA
 my $listHTML = "<div class=\'listHTML\'><ul>\n";

+# Make a list of URLs that have an error
+my $listURLError = "The following feeds had issues this time:\n\n" . $now->strftime('%a, %d %b %Y %H:%M:%S %z'). "\n\n";
+
 # Go through each URL in the DATA section and make the new list
 while ( my $url = <DATA>) {
    chomp $url;
@ -76,7 +85,12 @@ while ( my $url = <DATA>) {
 	# parse the XML
    my $rss1 = XML::RSS->new;
    eval { $rss1->parse( $xml ) };
-    next if $@;
+    
+    # if parsing failed, add URL to log file variable
+ 	if ($@) {
+    	$listURLError .= "* " . $url . "\n";
+    	next; 
+    }; # if $@
    
    # go through the items from the XML
    for (my $j = 0; $j <= $number_of_items; $j++){
@ -138,6 +152,14 @@ while ( my $url = <DATA>) {
 #close out the list URL html
 $listHTML .= "</ul></div>\n";

+###################################
+#  Write the error file 
+###################################
+
+open(FH, '>', $errorFilePath) or die $!;
+print FH $listURLError;
+close(FH);
+
 ###################################
 #  Make an RSS Feed! 
 ###################################
@ -204,6 +226,9 @@ my $printDate = formatDate($rss2->{'channel'}{'pubDate'});
 # header for a direct HTML post 
 my $html_header = "Status: 200\nContent-type: text/html\n\n";

+
+
+
 ###################################
 #   Make the HTML Page
 ###################################
@ -353,7 +378,7 @@ http://fiendlover.blogspot.com/feeds/posts/default
 http://jacobhaddon.com/feed/
 http://apokrupha.com/feed/
 https://ellendatlow.com/feed/
-https://paulaguran.com/
+https://paulaguran.com/feed/
 https://amandaheadlee.com/feed/
 https://theimbloglio.wordpress.com/feed/
 https://kennethwcain.com/feed/
@ -364,4 +389,19 @@ https://weightlessbooks.com/feed/
 https://www.crystallakepub.com/feed/
 https://lynnehansen.zenfolio.com/blog.rss
 https://www.bevvincent.com/feed/
-http://liviallewellyn.com/feed/
+http://liviallewellyn.com/feed/
+https://www.kristidemeester.com/blog-feed.xml
+https://www.lucysnyder.com/index.php/feed/
+https://www.emilyruthverona.com/blog-feed.xml
+https://www.elizabethhand.com/welcome?format=rss
+https://www.jamielackey.com/feed/
+https://cv-hunt.com/feed/
+https://authorjenniferallisprovost.com/feed/
+https://jezzywolfe.wordpress.com/feed/
+https://lmariewood.com/feed/
+https://www.leemurray.info/blog-feed.xml
+https://meghanarcuri.com/feed/
+https://nicolecushing.wordpress.com/feed/
+https://saratantlinger.com/feed/
+https://sunnymoraine.com/feed/
+https://lauramauro.com/feed/
Author	SHA1	Message	Date
Jacob Haddon	e8b01b292c	this adds in date math to change from two months to two weeks for the feed length	2024-03-17 19:07:28 -04:00
Jacob Haddon	211dede75a	minor corrections	2024-03-07 14:28:03 -05:00
Jacob Haddon	7f0f20d779	updated blank with changes from other file	2024-03-07 14:22:17 -05:00
Jacob Haddon	1cb4d8d03d	Merge branch 'smhn'	2024-03-07 14:15:03 -05:00
Jacob Haddon	5efc041ed9	updated readme	2024-03-07 14:14:46 -05:00
Jacob Haddon	889aae9f6a	this ads an error log into the RSS feed loop, logging which URLs fail parsing	2024-03-06 19:53:25 -05:00