smhn/blankRSS.pl

#!/usr/bin/perl
# use cPanelUserConfig; #for cpanel servers

use 5.010;
use strict;
use warnings;

###################################
#
#  blankRSS.pl
#
# This script pulls from a list of RSS feeds and aggregates them together into a web page.
# It is designed to run as a cron job and overwrite the HTML file.
#
# license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
# Code repository: https://code.jacobhaddon.com/jake/smhn
# Written by Jacob Haddon https://jacobhaddon.com
#
###################################

# Packages

use Time::Piece; # https://perldoc.perl.org/Time::Piece
use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
use XML::RSS; # https://metacpan.org/pod/XML::RSS
use HTML::Entities; # https://metacpan.org/pod/HTML::Entities

# server file folders
# my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
# my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
# my $errorFilePath = "/home/USER_FOLDER/public_html/feed.log";

# local file folders
my $rssFilePath = "feed.xml";
my $htmlFilePath = "index.html";
my $errorFilePath = "feed.log";

###################################
#  RSS Configurations
###################################

my $title = "The Title of My Site";
my $homeLink = "http://example.com";
my $feedLink = "http://example/feed.xml";
my $description = 'A description of my feed, it should be one line in length.';
my $webmaster = 'webmaster@example.com';
my $copyright = 'Copyright respective writers';

###################################
# Go through list of URLs, get RSS feed,
# take newest 3 that are less than $then old,
# add to new RSS feed object
###################################

# get today, subtact time to make cut off
my $now = localtime;
my $then = $now->add_months(-2);

#number of items to keep from each feed
my $number_of_items = 2; # +1 since everything starts at 0

#list to hold the new RSS items
my %list;

# Make the list of URLS while parsing DATA
my $listHTML = "<div class=\'listHTML\'><ul>\n";

# Make a list of URLs that have an error
my $listURLError = "The following feeds had issues this time:\n\n" . $now->strftime('%a, %d %b %Y %H:%M:%S %z'). "\n\n";

# Go through each URL in the DATA section and make the new list
while ( my $url = <DATA>) {
    chomp $url;

    # get the XML from the URL
	my $ua = LWP::UserAgent->new;
	$ua->default_header('User-Agent' => 'Mozilla/5.0');
	my $resp = $ua->get($url) or next;
	my $xml = $resp->decoded_content;

	# parse the XML
    my $rss1 = XML::RSS->new;
    eval { $rss1->parse( $xml ) };

    # if empty, add URL to log file variable
 	if ($@) {
    	$listURLError .= "* " . $url . "\n";
    	next;
    }; # if $@

    # go through the items from the XML
    for (my $j = 0; $j <= $number_of_items; $j++){

		my $testItem =  @{ $rss1->{'items'}}[$j];

		# Get the pub date of the article
		my $testDate = $testItem->{'pubDate'};

		# Strip out the milliseconds
		$testDate =~ s/((\+|\-)(\d\d\d\d))//;

		# Convert to a TIME object
		my $t = Time::Piece->strptime($testDate);

		# If the post is older than one year, do not include it
		if ($t->epoch > $then->epoch) {
			$testItem->{'feedURL'} = $_;
			$testItem->{'feedName'} = $rss1->{'channel'}{'title'};

			# Find Author tags
			if ($testItem->{'dc'}{'creator'}) {
				$testItem->{'itemAuthor'} = $testItem->{'dc'}{'creator'};
			} elsif ($testItem->{'author'}) {
				$testItem->{'itemAuthor'} = $testItem->{'author'}
			} else {
				$testItem->{'itemAuthor'} = $rss1->{'channel'}{'title'};
			} # if author

			# Clean up some of the artifacts in the RSS feed 'description' section
			$testItem->{'description'} =~ s/\n\s*/\n/g; # get rid of excess white space
			$testItem->{'description'} =~ s/(<a(.+?)<\/a>)$//s; # link at end of description
			$testItem->{'description'} =~ s/<(\w) class=(.*?)>/<$1>/s; # remove class statements from text
			$testItem->{'description'} =~ s/<figure ((.|\n)+?)figure>//sg; #remove "figure" can use pipe to add more
			$testItem->{'description'} =~ s/<img(.+?)>//sg; # remove IMG tags
			$testItem->{'description'} =~ s/<span ((.|\n)+?)>//sg; #remove "span" tags (mostly blogger)
			$testItem->{'description'} =~ s/<\/span>//sg; #remove "span" endtags
			$testItem->{'description'} =~ s/<div class="separator" style(.+?)<\/div>//sg; # remove blogger DIV tags
			$testItem->{'description'} =~ s/<br(.+?)>/<br>/sg; # remove blogger BR tags
			$testItem->{'description'} =~ s/(<div><br><\/div>)+/<br>/sg; # remove blogger BR + DIV tags
			$testItem->{'description'} =~ s/.{1200}\K.*//s; # limit length
			$testItem->{'description'} =~ s/<\/?div.*?>//sg; # remove div tags
			$testItem->{'description'} =~ s/(<a[^<]+)$//s; # link at end of description

			#add continue reading to end.
			$testItem->{'description'} .= " <a href=\'" . $testItem->{'link'} . "\' target=\'_blank\'>Continue Reading</a>";

			# Set Epoch as KEY so easier to arrange in order
			$list{$t->epoch} = $testItem;
		} #if epoch

    } # for i loop

    # add to the list of URL HTML
	$listHTML .= "<li>" . $url . "</li>\n";

} # while DATA

#close out the list URL html
$listHTML .= "</div></ul>\n";

###################################
#  Write the error file
###################################

open(FH, '>', $errorFilePath) or die $!;
print FH $listURLError;
close(FH);

###################################
#  Make an RSS Feed!
###################################

# date format: Thu, 28 Dec 2023 03:51:42
# $now->strftime("%a, %d %b %Y %H:%M:%S %z");

my $rss2 = XML::RSS->new (version => '2.0');
$rss2->add_module(prefix => 'atom', uri => 'http://www.w3.org/2005/Atom');

$rss2->channel(title          => $title,
               link           => $homeLink,
               language       => 'en-US',
               description    => $description,
               copyright      => $copyright,
               pubDate        => $now->strftime('%a, %d %b %Y %H:%M:%S %z'),
               lastBuildDate  => $now->strftime('%a, %d %b %Y %H:%M:%S %z'),
               webMaster      => $webmaster,
               atom           => { 'link' => { 'href' => $feedLink, 'rel' => 'self', 'type' => 'application/rss+xml' } }
               ); # $rss->channel

# foreach ITEM, newest (highest EPOCH) first
foreach my $name (reverse sort keys %list) {

    $rss2->add_item(title => $list{$name}->{'title'},
        	permaLink  => $list{$name}->{'link'},
        	link  => $list{$name}->{'link'},
        	description => $list{$name}->{'description'},
        	pubDate => $list{$name}->{'pubDate'},
        	author => $list{$name}->{'dc'}{'creator'},
        	source => $list{$name}->{'feedName'},
        	sourceUrl => $list{$name}->{'feedURL'},
       ); # $rss->channel

} # foreach

# Save the RSS feed as a file
$rss2->save($rssFilePath);

###################################
#   Format the RSS data for HTML
###################################

# make the HTML for the processed RSS
my $rssHTML = "<div class=\'rssHTML'>\n";

# print the title and link of each RSS item
	foreach my $item (@{$rss2->{'items'}}) {

		$rssHTML .= "<h2 class=\'headline\'><a href='" . $item->{'link'} . "' target=\'_blank\'>" . encode_entities($item->{'title'}) . "</a></h2>\n\n";
		$rssHTML .= "<div class=\'byline\'>" . encode_entities($item->{'author'}) . " - " . formatDate($item->{'pubDate'}) ."</div>\n\n";
		$rssHTML .= "<div class=\'description\'>" . $item->{'description'} . "</div>\n\n";

	} # foreach item

# close out the rssHTML
$rssHTML .= "</div>\n\n";

# the webpage HTML

# format the pubDate
my $printDate = formatDate($rss2->{'channel'}{'pubDate'});

# header for a direct HTML post
my $html_header = "Status: 200\nContent-type: text/html\n\n";

###################################
#   Make the HTML Page
###################################

my $html = <<"HTML_END";
<!DOCTYPE html>
<html>

<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>$title</title>
	<style>
		body{
			margin:40px auto;
			max-width:650px;
			line-height:1.6;
			font-size:18px;
			font-family: Baskerville, TimesNewRoman, Times New Roman, Times, Georgia, serif;
			padding:0 10px
		}
		h1, h2, h3 {font-weight: 200;}
		hr{width:50%;}
		.byline{font-style:italic;}
		nav{
			font-size:20px;
			text-align: center;
		}
		a {text-decoration:none;}
		a:hover {text-decoration:underline;}

		/* Dark Mode Colors */
		\@media screen and (prefers-color-scheme: dark) {
			body{
				color: white;
				background-color: black;
			}
			a {color:orange;}
			a:visited{color:yellow;}

		} /* media screen */
	</style>
</head>

<body>

<header>
	<h1>$title</h1>
	<p>$description</p>
	<p>updated: $printDate</p>
</header>

<nav>
	<a href="#news">News</a> - <a href="#about">About</a> - <a href="#contact">Contact</a> - <a href="#list">List</a> - <a href="$feedLink">RSS Feed</a>
</nav>

<section id="news">
	<h1>The News!</h1>

	$rssHTML

</section>

<section id="about">
	<h1>About $title</h1>
	<p>$description</p>
</section>

<section id="contact">
	<h1>Contact</h1>
	<p>Contact $title at: $webmaster
</section>

<section id="list">
	<h1>The List!</h1>
	<p>This is the list of the feeds that we are checking. If you have an RSS reader, grab them and follow along!</p>

	$listHTML

</section>

<hr>

<footer>
	<p>A footer</p>
	<p><a href="https://code.jacobhaddon.com/jake/smhn">Code</a> by Jacob Haddon - license <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPLv3.0</a> - <a href="https://Apokrupha.com">Apokrupha.com</a></p>
</footer>

<hr>

</body>
</html>

HTML_END

# write the file
open(FH, '>', $htmlFilePath) or die $!;
print FH $html;
close(FH);

# print $html_header . $html;

###################################
#  Functions
###################################

sub formatDate {
	my $testDate = $_[0];
	$testDate =~ s/((\+|\-)(\d\d\d\d))//;
	# Convert to a TIME object
	my $t = Time::Piece->strptime($testDate);
	return($t->strftime('%a, %d %b %Y %H:%M:%S'));
}

# FIN

###################################
#  DATA is list of the feed URLs
###################################

__DATA__
https://nnw.ranchero.com/feed.json
https://feeds.npr.org/1001/rss.xml