Perl script for aggregating RSS and outputting an HTML and feed file. Blank

version and the SMHN specific configuration added
2024-02-13 14:13:11 -05:00 · 2024-02-13 14:13:11 -05:00 · c220728298
commit c220728298
parent ca2f76ee12
3 changed files with 655 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 index.html
 feed.xml
--- a/blankRSS.pl
+++ b/blankRSS.pl
@ -0,0 +1,313 @@
 #!/usr/bin/perl
 # use cPanelUserConfig; #for cpanel servers
 use 5.010;
 use strict;
 use warnings;
 ###################################
 #
 #  blankRSS.pl
 # 
 # This script pulls from a list of RSS feeds and aggregates them together into a web page.
 # It is designed to run as a cron job and overwrite the HTML file.
 # 
 # license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
 # Code repository: https://code.jacobhaddon.com/jake/smhn
 # Written by Jacob Haddon https://jacobhaddon.com
 #
 ###################################
 # Packages 
 use Time::Piece; # https://perldoc.perl.org/Time::Piece
 use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
 use XML::RSS; # https://metacpan.org/pod/XML::RSS
 use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
 # server file folders
 # my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
 # my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
 # local file folders
 my $rssFilePath = "feed.xml";
 my $htmlFilePath = "index.html";
 ###################################
 #  RSS Configurations 
 ###################################
 my $title = "The Title of My Site";
 my $homeLink = "http://example.com";
 my $feedLink = "http://example/feed.xml";
 my $description = 'A description of my feed, it should be one line in length.';
 my $webmaster = 'webmaster@example.com';
 my $copyright = 'Copyright respective writers';
 ###################################
 # Go through list of URLs, get RSS feed, 
 # take newest 3 that are less than $then old, 
 # add to new RSS feed object
 ###################################
 # get today, subtract two months to make the cut off
 my $now = localtime;
 my $then = $now->add_months(-2);
 # highest item index to keep from each feed (0-based, so 2 keeps the newest 3)
 my $number_of_items = 2;
 # layout of an RSS pubDate once the time zone has been stripped off
 my $pubDateFormat = '%a, %d %b %Y %H:%M:%S';
 # list to hold the new RSS items, keyed by publication epoch
 my %list;
 # one user agent is enough for every request
 my $ua = LWP::UserAgent->new;
 $ua->default_header('User-Agent' => 'Mozilla/5.0');
 # Make the list of URLS while parsing DATA
 my $listHTML = "<div class=\'listHTML\'><ul>\n";
 while ( my $url = <DATA>) {
 	chomp $url;
 	# tolerate stray white space and blank lines in the feed list
 	$url =~ s/^\s+|\s+$//g;
 	next unless length $url;
 	# get the XML from the URL; get() always returns a response object,
 	# so a failed request has to be detected with is_success
 	my $resp = $ua->get($url);
 	next unless $resp->is_success;
 	my $xml = $resp->decoded_content;
 	next unless defined $xml;
 	# parse the XML; a feed that does not parse is skipped (and not listed)
 	my $rss1 = XML::RSS->new;
 	next unless eval { $rss1->parse( $xml ); 1 };
 	# go through the newest items from the XML, never past the end of the feed
 	my @items = @{ $rss1->{'items'} || [] };
 	my $last = $#items < $number_of_items ? $#items : $number_of_items;
 	for my $testItem (@items[0 .. $last]) {
 		# Get the pub date of the article; items without one cannot be ordered
 		my $testDate = $testItem->{'pubDate'};
 		next unless defined $testDate;
 		# Strip out the time zone (numeric offset or trailing name such as GMT)
 		$testDate =~ s/((\+|\-)(\d\d\d\d))//;
 		$testDate =~ s/^\s+|\s+$//g;
 		$testDate =~ s/(?<=\d)\s+[A-Za-z]{1,5}$//;
 		# Convert to a TIME object. The format is given explicitly because the
 		# strptime default only reads the date and drops the time of day;
 		# the default is still tried as a fallback for date-only values.
 		my $t = eval { Time::Piece->strptime($testDate, $pubDateFormat) }
 			// eval { Time::Piece->strptime($testDate) };
 		next unless defined $t;
 		# If the post is older than the cut off, do not include it
 		next unless $t->epoch > $then->epoch;
 		$testItem->{'feedURL'} = $url;
 		$testItem->{'feedName'} = $rss1->{'channel'}{'title'};
 		# Clean up some of the artifacts in the RSS feed 'description' section
 		# (an item without a description is treated as empty)
 		my $text = $testItem->{'description'} // '';
 		$text =~ s/\n\s*/\n/g; # get rid of excess white space
 		# drop a link at the very end of the description; the look-ahead keeps
 		# the match from starting at an earlier link and eating the text between
 		$text =~ s/<a\b(?:(?!<a\b).)+?<\/a>$//s;
 		# remove class attributes from every tag, leaving other attributes alone
 		$text =~ s/(<\w+[^<>]*?)\s+class\s*=\s*(?:"[^"]*"|'[^']*'|[^\s<>]+)/$1/g;
 		$text =~ s/<figure ((.|\n)+?)figure>//sg; #remove "figure" can use pipe to add more
 		$text =~ s/<img(.+?)>//sg; # remove IMG tags
 		$text =~ s/.{1200}\K.*//s; # limit length
 		# remove the last tag and whatever follows it, so a tag cut in half
 		# by the length limit is not left dangling
 		$text =~ s/(<[^<]+)$//s;
 		#add continue reading to end; the link is escaped for use in an attribute
 		my $link = encode_entities($testItem->{'link'} // '');
 		$text .= " <a href=\'" . $link . "\' target=\'_blank\'>Continue Reading</a>";
 		$testItem->{'description'} = $text;
 		# Set Epoch as KEY so easier to arrange in order; nudge the key forward
 		# when two posts share a timestamp so neither overwrites the other
 		my $key = $t->epoch;
 		$key++ while exists $list{$key};
 		$list{$key} = $testItem;
 	} # for each item
 	# add to the list of URL HTML
 	$listHTML .= "<li>" . $url . "</li>\n";
 } # while DATA
 #close out the list URL html (the ul was opened inside the div)
 $listHTML .= "</ul></div>\n";
 ###################################
 #  Make an RSS Feed! 
 ###################################
 # date format: Thu, 28 Dec 2023 03:51:42
 # $now->strftime("%a, %d %b %Y %H:%M:%S %z");
 # Build the outgoing RSS 2.0 feed from the collected items and save it.
 my $rss2 = XML::RSS->new (version => '2.0');
 $rss2->add_module(prefix => 'atom', uri => 'http://www.w3.org/2005/Atom');
 # pubDate and lastBuildDate are both the time of this run
 my $buildDate = $now->strftime('%a, %d %b %Y %H:%M:%S %z');
 $rss2->channel(title          => $title,
               link           => $homeLink,
               language       => 'en-US',
               description    => $description,
               copyright      => $copyright,
               pubDate        => $buildDate,
               lastBuildDate  => $buildDate,
               webMaster      => $webmaster,
               atom           => { 'link' => { 'href' => $feedLink, 'rel' => 'self', 'type' => 'application/rss+xml' } }
               ); # $rss2->channel
 # foreach ITEM, newest (highest EPOCH) first; the keys are numbers,
 # so they are compared numerically rather than as strings
 foreach my $name (sort { $b <=> $a } keys %list) {
 	my $entry = $list{$name};
 	$rss2->add_item(title => $entry->{'title'},
 		permaLink  => $entry->{'link'},
 		link  => $entry->{'link'},
 		description => $entry->{'description'},
 		pubDate => $entry->{'pubDate'},
 		author => $entry->{'dc'}{'creator'},
 		source => $entry->{'feedName'},
 		sourceUrl => $entry->{'feedURL'},
 	); # $rss2->add_item
 } # foreach
 # Save the RSS feed as a file
 $rss2->save($rssFilePath);
 ###################################
 #   Format the RSS data for HTML
 ###################################
 # make the HTML for the processed RSS 
 # make the HTML for the processed RSS
 my $rssHTML = "<div class=\'rssHTML'>\n";
 # print the title and link of each RSS item
 foreach my $item (@{$rss2->{'items'}}) {
 	# feed-supplied values are escaped before they go into the markup;
 	# a missing value becomes an empty string instead of raising warnings
 	my $link = encode_entities($item->{'link'} // '');
 	my $headline = encode_entities($item->{'title'} // '');
 	my $author = encode_entities($item->{'author'} // '');
 	$rssHTML .= "<h2 class=\'headline\'><a href='" . $link . "' target=\'_blank\'>" . $headline . "</a></h2>\n\n";
 	$rssHTML .= "<div class=\'byline\'>" . $author . " - " . formatDate($item->{'pubDate'}) ."</div>\n\n";
 	# the description is HTML, so it is inserted without escaping
 	$rssHTML .= "<div class=\'description\'>" . $item->{'description'} . "</div>\n\n";
 } # foreach item
 # close out the rssHTML
 $rssHTML .= "</div>\n\n";
 # the webpage HTML
 # format the pubDate
 my $printDate = formatDate($rss2->{'channel'}{'pubDate'});
 # header for a direct HTML post (only used by the commented-out print further down)
 my $html_header = "Status: 200\nContent-type: text/html\n\n";
 ###################################
 #   Make the HTML Page
 ###################################
 # Page template: the configuration values, $printDate, $rssHTML and $listHTML
 # are interpolated into it; \@ keeps the CSS @media rule literal.
 my $html = <<"HTML_END";
 <!DOCTYPE html>
 <html>
 <head>
 	<meta charset="utf-8">
 	<meta name="viewport" content="width=device-width, initial-scale=1">
 	<title>$title</title>
 	<style>
 		body{
 			margin:40px auto;
 			max-width:650px;
 			line-height:1.6;
 			font-size:18px;
 			font-family: Baskerville, TimesNewRoman, Times New Roman, Times, Georgia, serif;
 			padding:0 10px
 		}
 		h1, h2, h3 {font-weight: 200;}
 		hr{width:50%;}
 		.byline{font-style:italic;}
 		nav{
 			font-size:20px;
 			text-align: center;
 		}
 		a {text-decoration:none;}
 		a:hover {text-decoration:underline;}
 		/* Dark Mode Colors */
 		\@media screen and (prefers-color-scheme: dark) {
 			body{ 
 				color: white;
 				background-color: black;
 			}
 			a {color:orange;}
 			a:visited{color:yellow;}
 		} /* media screen */
 	</style>
 </head>
 <body>
 <header>
 	<h1>$title</h1>
 	<p>$description</p>
 	<p>updated: $printDate</p>
 </header>
 <nav>
 	<a href="#news">News</a> - <a href="#about">About</a> - <a href="#contact">Contact</a> - <a href="#list">List</a> - <a href="$feedLink">RSS Feed</a>
 </nav>
 <section id="news">
 	<h1>The News!</h1>
 	$rssHTML
 </section>
 <section id="about">
 	<h1>About $title</h1>
 	<p>$description</p>
 </section>
 <section id="contact">
 	<h1>Contact</h1>
 	<p>Contact $title at: $webmaster</p>
 </section>
 <section id="list">
 	<h1>The List!</h1>
 	<p>This is the list of the feeds that we are checking. If you have an RSS reader, grab them and follow along!</p>
 	$listHTML
 </section>
 <hr>
 <footer>
 	<p>A footer</p>
 	<p><a href="https://code.jacobhaddon.com/jake/smhn">Code</a> by Jacob Haddon - license <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPLv3.0</a> - <a href="https://Apokrupha.com">Apokrupha.com</a></p>
 </footer>
 <hr>
 </body>
 </html> 
 HTML_END
 # write the file as UTF-8, matching the charset the page declares;
 # a lexical handle is used and every step is checked, because buffered
 # write errors may only show up at close
 open(my $htmlFH, '>:encoding(UTF-8)', $htmlFilePath) or die "Cannot open $htmlFilePath: $!";
 print {$htmlFH} $html or die "Cannot write $htmlFilePath: $!";
 close($htmlFH) or die "Cannot close $htmlFilePath: $!";
 # print $html_header . $html;
 ###################################
 #  Functions 
 ###################################
 # formatDate($pubDate)
 # Turns an RSS pubDate such as "Thu, 28 Dec 2023 03:51:42 +0000" into
 # "Thu, 28 Dec 2023 03:51:42" (same wall-clock time, time zone removed).
 # Returns '' for an undefined value and the input unchanged when it
 # cannot be parsed, so one odd date does not stop the whole run.
 sub formatDate {
 	my ($rawDate) = @_;
 	return '' unless defined $rawDate;
 	# Strip out the time zone (numeric offset or trailing name such as GMT)
 	(my $stamp = $rawDate) =~ s/((\+|\-)(\d\d\d\d))//;
 	$stamp =~ s/^\s+|\s+$//g;
 	$stamp =~ s/(?<=\d)\s+[A-Za-z]{1,5}$//;
 	# Convert to a TIME object. The format is given explicitly because the
 	# strptime default only reads the date and would report 00:00:00;
 	# the default is still tried as a fallback for date-only values.
 	my $t = eval { Time::Piece->strptime($stamp, '%a, %d %b %Y %H:%M:%S') }
 		// eval { Time::Piece->strptime($stamp) };
 	return $rawDate unless defined $t;
 	return($t->strftime('%a, %d %b %Y %H:%M:%S'));
 }
 # FIN
 ###################################
 #  DATA is list of the feed URLs
 ###################################
 __DATA__
 https://nnw.ranchero.com/feed.json
 https://feeds.npr.org/1001/rss.xml
--- a/smhnRSS.pl
+++ b/smhnRSS.pl
@ -0,0 +1,340 @@
 #!/usr/bin/perl
 use cPanelUserConfig; #for cpanel servers
 use 5.010;
 use strict;
 use warnings;
 ###################################
 #
 # The Sunday Morning Horror News
 # 
 # This script pulls from a list of RSS feeds and aggregates them together into a web page.
 # It is designed to run as a cron job and overwrite the HTML file.
 # 
 # license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
 # Code repository: https://code.jacobhaddon.com/jake/smhn
 # Written by Jacob Haddon https://jacobhaddon.com
 #
 ###################################
 # Packages 
 use Time::Piece; # https://perldoc.perl.org/Time::Piece
 use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
 use XML::RSS; # https://metacpan.org/pod/XML::RSS
 use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
 # server file folders
 # my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
 # my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
 # local file folders
 my $rssFilePath = "feed.xml";
 my $htmlFilePath = "index.html";
 ###################################
 #  RSS Configurations 
 ###################################
 my $title = "Sunday Morning Horror News";
 my $homeLink = "http://sundaymorninghorrornews.com";
 my $feedLink = "http://sundaymorninghorrornews.com/feed.xml";
 my $description = 'A collection of horror writer, editor, poet and publisher blogs from all over the web!';
 my $webmaster = 'webmaster@sundaymorninghorrornews.com';
 my $copyright = 'Copyright respective writers';
 ###################################
 # Go through list of URLs, get RSS feed, 
 # take newest 3 that are less than $then old, 
 # add to new RSS feed object
 ###################################
 # get today, subtract two months to make the cut off
 my $now = localtime;
 my $then = $now->add_months(-2);
 # highest item index to keep from each feed (0-based, so 2 keeps the newest 3)
 my $number_of_items = 2;
 # layout of an RSS pubDate once the time zone has been stripped off
 my $pubDateFormat = '%a, %d %b %Y %H:%M:%S';
 # list to hold the new RSS items, keyed by publication epoch
 my %list;
 # one user agent is enough for every request
 my $ua = LWP::UserAgent->new;
 $ua->default_header('User-Agent' => 'Mozilla/5.0');
 # Make the list of URLS while parsing DATA
 my $listHTML = "<div class=\'listHTML\'><ul>\n";
 while ( my $url = <DATA>) {
 	chomp $url;
 	# tolerate stray white space and blank lines in the feed list
 	$url =~ s/^\s+|\s+$//g;
 	next unless length $url;
 	# get the XML from the URL; get() always returns a response object,
 	# so a failed request has to be detected with is_success
 	my $resp = $ua->get($url);
 	next unless $resp->is_success;
 	my $xml = $resp->decoded_content;
 	next unless defined $xml;
 	# parse the XML; a feed that does not parse is skipped (and not listed)
 	my $rss1 = XML::RSS->new;
 	next unless eval { $rss1->parse( $xml ); 1 };
 	# go through the newest items from the XML, never past the end of the feed
 	my @items = @{ $rss1->{'items'} || [] };
 	my $last = $#items < $number_of_items ? $#items : $number_of_items;
 	for my $testItem (@items[0 .. $last]) {
 		# Get the pub date of the article; items without one cannot be ordered
 		my $testDate = $testItem->{'pubDate'};
 		next unless defined $testDate;
 		# Strip out the time zone (numeric offset or trailing name such as GMT)
 		$testDate =~ s/((\+|\-)(\d\d\d\d))//;
 		$testDate =~ s/^\s+|\s+$//g;
 		$testDate =~ s/(?<=\d)\s+[A-Za-z]{1,5}$//;
 		# Convert to a TIME object. The format is given explicitly because the
 		# strptime default only reads the date and drops the time of day;
 		# the default is still tried as a fallback for date-only values.
 		my $t = eval { Time::Piece->strptime($testDate, $pubDateFormat) }
 			// eval { Time::Piece->strptime($testDate) };
 		next unless defined $t;
 		# If the post is older than the cut off, do not include it
 		next unless $t->epoch > $then->epoch;
 		$testItem->{'feedURL'} = $url;
 		$testItem->{'feedName'} = $rss1->{'channel'}{'title'};
 		# Clean up some of the artifacts in the RSS feed 'description' section
 		# (an item without a description is treated as empty)
 		my $text = $testItem->{'description'} // '';
 		$text =~ s/\n\s*/\n/g; # get rid of excess white space
 		# drop a link at the very end of the description; the look-ahead keeps
 		# the match from starting at an earlier link and eating the text between
 		$text =~ s/<a\b(?:(?!<a\b).)+?<\/a>$//s;
 		# remove class attributes from every tag, leaving other attributes alone
 		$text =~ s/(<\w+[^<>]*?)\s+class\s*=\s*(?:"[^"]*"|'[^']*'|[^\s<>]+)/$1/g;
 		$text =~ s/<figure ((.|\n)+?)figure>//sg; #remove "figure" can use pipe to add more
 		$text =~ s/<img(.+?)>//sg; # remove IMG tags
 		$text =~ s/.{1200}\K.*//s; # limit length
 		# remove the last tag and whatever follows it, so a tag cut in half
 		# by the length limit is not left dangling
 		$text =~ s/(<[^<]+)$//s;
 		#add continue reading to end; the link is escaped for use in an attribute
 		my $link = encode_entities($testItem->{'link'} // '');
 		$text .= " <a href=\'" . $link . "\' target=\'_blank\'>Continue Reading</a>";
 		$testItem->{'description'} = $text;
 		# Set Epoch as KEY so easier to arrange in order; nudge the key forward
 		# when two posts share a timestamp so neither overwrites the other
 		my $key = $t->epoch;
 		$key++ while exists $list{$key};
 		$list{$key} = $testItem;
 	} # for each item
 	# add to the list of URL HTML
 	$listHTML .= "<li>" . $url . "</li>\n";
 } # while DATA
 #close out the list URL html (the ul was opened inside the div)
 $listHTML .= "</ul></div>\n";
 ###################################
 #  Make an RSS Feed! 
 ###################################
 # date format: Thu, 28 Dec 2023 03:51:42
 # $now->strftime("%a, %d %b %Y %H:%M:%S %z");
 # Build the outgoing RSS 2.0 feed from the collected items and save it.
 my $rss2 = XML::RSS->new (version => '2.0');
 $rss2->add_module(prefix => 'atom', uri => 'http://www.w3.org/2005/Atom');
 # pubDate and lastBuildDate are both the time of this run
 my $buildDate = $now->strftime('%a, %d %b %Y %H:%M:%S %z');
 $rss2->channel(title          => $title,
               link           => $homeLink,
               language       => 'en-US',
               description    => $description,
               copyright      => $copyright,
               pubDate        => $buildDate,
               lastBuildDate  => $buildDate,
               webMaster      => $webmaster,
               atom           => { 'link' => { 'href' => $feedLink, 'rel' => 'self', 'type' => 'application/rss+xml' } }
               ); # $rss2->channel
 # foreach ITEM, newest (highest EPOCH) first; the keys are numbers,
 # so they are compared numerically rather than as strings
 foreach my $name (sort { $b <=> $a } keys %list) {
 	my $entry = $list{$name};
 	$rss2->add_item(title => $entry->{'title'},
 		permaLink  => $entry->{'link'},
 		link  => $entry->{'link'},
 		description => $entry->{'description'},
 		pubDate => $entry->{'pubDate'},
 		author => $entry->{'dc'}{'creator'},
 		source => $entry->{'feedName'},
 		sourceUrl => $entry->{'feedURL'},
 	); # $rss2->add_item
 } # foreach
 # Save the RSS feed as a file
 $rss2->save($rssFilePath);
 ###################################
 #   Format the RSS data for HTML
 ###################################
 # make the HTML for the processed RSS 
 # make the HTML for the processed RSS
 my $rssHTML = "<div class=\'rssHTML'>\n";
 # print the title and link of each RSS item
 foreach my $item (@{$rss2->{'items'}}) {
 	# feed-supplied values are escaped before they go into the markup;
 	# a missing value becomes an empty string instead of raising warnings
 	my $link = encode_entities($item->{'link'} // '');
 	my $headline = encode_entities($item->{'title'} // '');
 	my $author = encode_entities($item->{'author'} // '');
 	$rssHTML .= "<h2 class=\'headline\'><a href='" . $link . "' target=\'_blank\'>" . $headline . "</a></h2>\n\n";
 	$rssHTML .= "<div class=\'byline\'>" . $author . " - " . formatDate($item->{'pubDate'}) ."</div>\n\n";
 	# the description is HTML, so it is inserted without escaping
 	$rssHTML .= "<div class=\'description\'>" . $item->{'description'} . "</div>\n\n";
 } # foreach item
 # close out the rssHTML
 $rssHTML .= "</div>\n\n";
 # the webpage HTML
 # format the pubDate
 my $printDate = formatDate($rss2->{'channel'}{'pubDate'});
 # header for a direct HTML post (only used by the commented-out print further down)
 my $html_header = "Status: 200\nContent-type: text/html\n\n";
 ###################################
 #   Make the HTML Page
 ###################################
 # Page template: the configuration values, $printDate, $rssHTML and $listHTML
 # are interpolated into it; \@ keeps the CSS @media rule literal.
 my $html = <<"HTML_END";
 <!DOCTYPE html>
 <html>
 <head>
 	<meta charset="utf-8">
 	<meta name="viewport" content="width=device-width, initial-scale=1">
 	<title>The Sunday Morning Horror News</title>
 	<style>
 		body{
 			margin:40px auto;
 			max-width:650px;
 			line-height:1.6;
 			font-size:18px;
 			font-family: Baskerville, TimesNewRoman, Times New Roman, Times, Georgia, serif;
 			padding:0 10px
 		}
 		h1, h2, h3 {font-weight: 200;}
 		hr{width:50%;}
 		.byline{font-style:italic;}
 		nav{
 			font-size:20px;
 			text-align: center;
 		}
 		a {text-decoration:none;}
 		a:hover {text-decoration:underline;}
 		/* Dark Mode Colors */
 		\@media screen and (prefers-color-scheme: dark) {
 			body{ 
 				color: white;
 				background-color: black;
 			}
 			a {color:orange;}
 			a:visited{color:yellow;}
 		} /* media screen */
 	</style>
 </head>
 <body>
 <header>
 	<h1>$title</h1>
 	<p>$description</p>
 	<p>This site is updated every Sunday morning</p>
 	<p>updated: $printDate</p>
 </header>
 <nav>
 <a href="#news">News</a> - <a href="#about">About</a> - <a href="#contact">Contact</a> - <a href="#list">List</a> - <a href="$feedLink">RSS Feed</a>
 </nav>
 <section id="news">
 	<h1>The News!</h1>
 	$rssHTML
 </section>
 <section id="about">
 	<h1>About $title</h1>
 	<p>The $title is an aggregator of RSS feeds from writers, poets and publishers.</p>
 	<p>Too often the world is flying by at the speed of social media. $title is designed to update just once a week (on Sunday!) so you can take your time and catch up on the news without having to scroll.</p>
 </section>
 <section id="contact">
 	<h1>Contact</h1>
 	<p>Contact $title at: $webmaster</p>
 	<p>We are intolerant of intolerance. If you see something hateful, transphobic, homophobic, racist, or the like, let us know.</p>
 	<p>We don't publish the content you see here, just collect it, but we don't need to collect trash.</p>
 </section>
 <section id="list">
 	<h1>The List!</h1>
 	<p>This is the list of the feeds that we are checking. If you have an RSS reader, grab them and follow along!</p>
 	$listHTML
 </section>
 <hr>
 <footer>
 	<p>This news feed made with <a href="https://www.perl.org/">Perl</a> and <a href="https://www.barebones.com/products/bbedit/">BBEdit</a>. Best viewable in Netscape Navigator 4 or higher</p>
 	<p>Open web features like RSS make sites like this possible. Support the open web!</p>
 	<p>this is a <a href="http://motherfuckingwebsite.com/">motherfucking website</a> and a <a href="http://bettermotherfuckingwebsite.com/">better motherfucking</a> website.</p>
 	<p><a href="https://code.jacobhaddon.com/jake/smhn">Code</a> by Jacob Haddon - license <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPLv3.0</a> - <a href="https://Apokrupha.com">Apokrupha.com</a></p>
 </footer>
 <hr>
 </body>
 </html> 
 HTML_END
 # write the file as UTF-8, matching the charset the page declares;
 # a lexical handle is used and every step is checked, because buffered
 # write errors may only show up at close
 open(my $htmlFH, '>:encoding(UTF-8)', $htmlFilePath) or die "Cannot open $htmlFilePath: $!";
 print {$htmlFH} $html or die "Cannot write $htmlFilePath: $!";
 close($htmlFH) or die "Cannot close $htmlFilePath: $!";
 # print $html_header . $html;
 ###################################
 #  Functions 
 ###################################
 # formatDate($pubDate)
 # Turns an RSS pubDate such as "Thu, 28 Dec 2023 03:51:42 +0000" into
 # "Thu, 28 Dec 2023 03:51:42" (same wall-clock time, time zone removed).
 # Returns '' for an undefined value and the input unchanged when it
 # cannot be parsed, so one odd date does not stop the whole run.
 sub formatDate {
 	my ($rawDate) = @_;
 	return '' unless defined $rawDate;
 	# Strip out the time zone (numeric offset or trailing name such as GMT)
 	(my $stamp = $rawDate) =~ s/((\+|\-)(\d\d\d\d))//;
 	$stamp =~ s/^\s+|\s+$//g;
 	$stamp =~ s/(?<=\d)\s+[A-Za-z]{1,5}$//;
 	# Convert to a TIME object. The format is given explicitly because the
 	# strptime default only reads the date and would report 00:00:00;
 	# the default is still tried as a fallback for date-only values.
 	my $t = eval { Time::Piece->strptime($stamp, '%a, %d %b %Y %H:%M:%S') }
 		// eval { Time::Piece->strptime($stamp) };
 	return $rawDate unless defined $t;
 	return($t->strftime('%a, %d %b %Y %H:%M:%S'));
 }
 # FIN
 ###################################
 #  DATA is list of the feed URLs
 ###################################
 __DATA__
 https://ecatherine.com/feed/
 https://redlagoe.com/feed/
 https://diebooth.wordpress.com/feed/
 https://kelliowen.com/feed/
 https://mercedesmyardley.com/feed/
 https://mehitobel.com/feed/
 http://antoncancre.blogspot.com/feeds/posts/default?alt=rss
 https://www.toddkeisling.com/news?format=rss
 https://www.jfgonzalez.org/news?format=rss
 https://www.briankeene.com/news?format=rss
 https://ghoulish.rip/feed/
 https://www.cemeterydance.com/extras/feed/
 https://www.gwendolynkiste.com/Blog/feed/
 https://katherinesilvaauthor.substack.com/feed
 https://queenofswordspress.com/feed/
 http://fiendlover.blogspot.com/feeds/posts/default
 http://jacobhaddon.com/feed/
 http://apokrupha.com/feed/
 https://ellendatlow.com/feed/
 https://paulaguran.com/