smhn/blankRSS.pl
2024-03-07 14:22:17 -05:00

347 lines
10 KiB
Perl
Executable File

#!/usr/bin/perl
# use cPanelUserConfig; #for cpanel servers
use 5.010;
use strict;
use warnings;
###################################
#
# blankRSS.pl
#
# This script pulls from a list of RSS feeds and aggregates them together into a web page.
# It is designed to run as a cron job and overwrite the HTML file.
#
# license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
# Code repository: https://code.jacobhaddon.com/jake/smhn
# Written by Jacob Haddon https://jacobhaddon.com
#
###################################
# Packages
use Time::Piece; # https://perldoc.perl.org/Time::Piece
use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
use XML::RSS; # https://metacpan.org/pod/XML::RSS
use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
###################################
# Output locations
###################################

# On a server, point these at absolute paths instead, e.g.:
# my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
# my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
# my $errorFilePath = "/home/USER_FOLDER/public_html/feed.log";

# Local run: the feed, the page and the log are written next to the script.
my ( $rssFilePath, $htmlFilePath, $errorFilePath )
    = qw( feed.xml index.html feed.log );

###################################
# RSS Configurations
###################################

# Channel metadata used for both the generated feed and the web page.
my $title       = q{The Title of My Site};
my $homeLink    = q{http://example.com};
my $feedLink    = q{http://example/feed.xml};
my $description = q{A description of my feed, it should be one line in length.};
my $webmaster   = q{webmaster@example.com};
my $copyright   = q{Copyright respective writers};
###################################
# Go through list of URLs, get RSS feed,
# take the newest 3 items that are newer than $then,
# add to new RSS feed object
###################################

# get today, subtract time to make the cut off
my $now  = localtime;
my $then = $now->add_months(-2);

# highest item index to keep from each feed
my $number_of_items = 2;    # +1 since everything starts at 0

# list to hold the new RSS items, keyed by publication epoch
my %list;

# Make the list of URLS while parsing DATA
my $listHTML = "<div class='listHTML'><ul>\n";

# Make a list of URLs that have an error
my $listURLError = "The following feeds had issues this time:\n\n"
    . $now->strftime('%a, %d %b %Y %H:%M:%S %z') . "\n\n";

# One user agent serves every request
my $ua = LWP::UserAgent->new;
$ua->default_header( 'User-Agent' => 'Mozilla/5.0' );

# Go through each URL in the DATA section and make the new list
FEED:
while ( my $url = <DATA> ) {
    chomp $url;

    # tolerate stray whitespace / CRLF endings and skip blank lines
    $url =~ s/\A\s+|\s+\z//g;
    next FEED if $url eq '';

    # get the XML from the URL; get() always returns a response object,
    # so a failed download has to be detected with is_success
    my $resp = $ua->get($url);
    if ( !$resp->is_success ) {
        $listURLError .= "* " . $url . "\n";
        next FEED;
    }
    my $xml = $resp->decoded_content;

    # parse the XML; if that fails, add URL to log file variable
    my $rss1 = XML::RSS->new;
    if ( !eval { $rss1->parse($xml); 1 } ) {
        $listURLError .= "* " . $url . "\n";
        next FEED;
    }

    # only look at items that actually exist (short feeds have fewer)
    my @items = @{ $rss1->{'items'} || [] };
    my $last  = $#items < $number_of_items ? $#items : $number_of_items;

    # go through the items from the XML
    ITEM:
    for my $testItem ( @items[ 0 .. $last ] ) {

        # Get the pub date of the article; nothing to sort by without one
        my $testDate = $testItem->{'pubDate'};
        next ITEM unless defined $testDate;

        # Strip out the time zone offset (+0000 / -0500) or zone name (GMT)
        $testDate =~ s/\s*[+-]\d{4}//;
        $testDate =~ s/\s+[A-Z]{1,5}\s*\z//;
        $testDate =~ s/\A\s+|\s+\z//g;

        # Convert to a TIME object. The format is given explicitly because
        # strptime's default ("%a, %d %b %Y") ignores the time of day; fall
        # back to that date-only default for feeds that publish no time.
        my $t = eval { Time::Piece->strptime( $testDate, '%a, %d %b %Y %H:%M:%S' ) };
        $t = eval { Time::Piece->strptime($testDate) } unless defined $t;
        next ITEM unless defined $t;    # unparseable date: skip, don't die

        # If the post is older than the cut off, do not include it
        next ITEM unless $t->epoch > $then->epoch;

        $testItem->{'feedURL'}  = $url;
        $testItem->{'feedName'} = $rss1->{'channel'}{'title'};

        # Find Author tags, falling back to the feed's own title
        if ( $testItem->{'dc'}{'creator'} ) {
            $testItem->{'itemAuthor'} = $testItem->{'dc'}{'creator'};
        }
        elsif ( $testItem->{'author'} ) {
            $testItem->{'itemAuthor'} = $testItem->{'author'};
        }
        else {
            $testItem->{'itemAuthor'} = $rss1->{'channel'}{'title'};
        }

        # Clean up some of the artifacts in the RSS feed 'description' section
        my $text = $testItem->{'description'} // '';
        $text =~ s/\n\s*/\n/g;                                   # get rid of excess white space
        $text =~ s/(<a(.+?)<\/a>)$//s;                           # link at end of description
        $text =~ s/<(\w) class=(.*?)>/<$1>/s;                    # remove class statements from text
        $text =~ s/<figure ((.|\n)+?)figure>//sg;                # remove "figure" can use pipe to add more
        $text =~ s/<img(.+?)>//sg;                               # remove IMG tags
        $text =~ s/<span ((.|\n)+?)>//sg;                        # remove "span" tags (mostly blogger)
        $text =~ s/<\/span>//sg;                                 # remove "span" endtags
        $text =~ s/<div class="separator" style(.+?)<\/div>//sg; # remove blogger DIV tags
        $text =~ s/<br\b[^>]*>/<br>/sg;                          # normalise BR tags without eating text after a plain <br>
        $text =~ s/(<div><br><\/div>)+/<br>/sg;                  # remove blogger BR + DIV tags
        $text =~ s/.{1200}\K.*//s;                               # limit length
        $text =~ s/<\/?div.*?>//sg;                              # remove div tags
        $text =~ s/(<a[^<]+)$//s;                                # link cut off at end of description

        # add continue reading to end; the link comes from the feed, so escape it
        $text .= " <a href='"
            . encode_entities( $testItem->{'link'} // '' )
            . "' target='_blank'>Continue Reading</a>";
        $testItem->{'description'} = $text;

        # Set Epoch as KEY so easier to arrange in order. Two posts can share
        # a timestamp, so nudge the key forward rather than overwrite one.
        my $key = $t->epoch;
        $key++ while exists $list{$key};
        $list{$key} = $testItem;
    }    # for each item

    # add to the list of URL HTML
    $listHTML .= "<li>" . $url . "</li>\n";
}    # while DATA

# close out the list URL html (inner list first, then the wrapper)
$listHTML .= "</ul></div>\n";
###################################
# Write the error file
###################################

# Overwrite the log with this run's list of feeds that could not be used.
# A lexical handle is used so it cannot clash with other handles in the file,
# and close is checked because buffered write errors only surface there.
open( my $errorFH, '>', $errorFilePath )
    or die "Cannot open $errorFilePath for writing: $!";
print {$errorFH} $listURLError;
close($errorFH)
    or die "Cannot finish writing $errorFilePath: $!";
###################################
# Make an RSS Feed!
###################################

# date format: Thu, 28 Dec 2023 03:51:42 +0000
my $buildDate = $now->strftime('%a, %d %b %Y %H:%M:%S %z');

my $rss2 = XML::RSS->new( version => '2.0' );
$rss2->add_module( prefix => 'atom', uri => 'http://www.w3.org/2005/Atom' );

$rss2->channel(
    title         => $title,
    link          => $homeLink,
    language      => 'en-US',
    description   => $description,
    copyright     => $copyright,
    pubDate       => $buildDate,
    lastBuildDate => $buildDate,
    webMaster     => $webmaster,
    atom          => { 'link' => { 'href' => $feedLink, 'rel' => 'self', 'type' => 'application/rss+xml' } }
);    # $rss->channel

# foreach ITEM, newest (highest EPOCH) first.
# The keys are epoch numbers, so compare them numerically, not as strings.
foreach my $name ( sort { $b <=> $a } keys %list ) {
    my $item = $list{$name};

    $rss2->add_item(
        title       => $item->{'title'},
        permaLink   => $item->{'link'},
        link        => $item->{'link'},
        description => $item->{'description'},
        pubDate     => $item->{'pubDate'},

        # itemAuthor already holds the dc:creator / author / feed title
        # fallback chain; dc:creator alone is missing on many feeds
        author      => $item->{'itemAuthor'} // $item->{'dc'}{'creator'},
        source      => $item->{'feedName'},
        sourceUrl   => $item->{'feedURL'},
    );    # $rss->add_item
}    # foreach

# Save the RSS feed as a file
$rss2->save($rssFilePath);
###################################
# Format the RSS data for HTML
###################################

# make the HTML for the processed RSS
my $rssHTML = "<div class='rssHTML'>\n";

# print the title, byline and description of each RSS item
foreach my $item ( @{ $rss2->{'items'} } ) {

    # Everything here originates from third-party feeds: escape what goes
    # into attributes and text, and treat missing values as empty.
    my $link     = encode_entities( $item->{'link'}   // '' );
    my $headline = encode_entities( $item->{'title'}  // '' );
    my $author   = encode_entities( $item->{'author'} // '' );
    my $date     = formatDate( $item->{'pubDate'} );

    $rssHTML .= "<h2 class='headline'><a href='" . $link . "' target='_blank'>" . $headline . "</a></h2>\n\n";
    $rssHTML .= "<div class='byline'>" . $author . " - " . $date . "</div>\n\n";

    # NOTE(review): the description is feed-supplied HTML and is inserted
    # unescaped on purpose so its markup renders; it is only as trustworthy
    # as the feeds listed in DATA.
    $rssHTML .= "<div class='description'>" . ( $item->{'description'} // '' ) . "</div>\n\n";
}    # foreach item

# close out the rssHTML
$rssHTML .= "</div>\n\n";

# the webpage HTML
# format the pubDate
my $printDate = formatDate( $rss2->{'channel'}{'pubDate'} );

# header for a direct HTML post (only needed when printing the page to STDOUT)
my $html_header = "Status: 200\nContent-type: text/html\n\n";
###################################
# Make the HTML Page
###################################

# Page template. The heredoc interpolates the configuration values, the
# rendered news items ($rssHTML) and the list of feeds ($listHTML).
my $html = <<"HTML_END";
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>$title</title>
<style>
body{
margin:40px auto;
max-width:650px;
line-height:1.6;
font-size:18px;
font-family: Baskerville, TimesNewRoman, Times New Roman, Times, Georgia, serif;
padding:0 10px
}
h1, h2, h3 {font-weight: 200;}
hr{width:50%;}
.byline{font-style:italic;}
nav{
font-size:20px;
text-align: center;
}
a {text-decoration:none;}
a:hover {text-decoration:underline;}
/* Dark Mode Colors */
\@media screen and (prefers-color-scheme: dark) {
body{
color: white;
background-color: black;
}
a {color:orange;}
a:visited{color:yellow;}
} /* media screen */
</style>
</head>
<body>
<header>
<h1>$title</h1>
<p>$description</p>
<p>updated: $printDate</p>
</header>
<nav>
<a href="#news">News</a> - <a href="#about">About</a> - <a href="#contact">Contact</a> - <a href="#list">List</a> - <a href="$feedLink">RSS Feed</a>
</nav>
<section id="news">
<h1>The News!</h1>
$rssHTML
</section>
<section id="about">
<h1>About $title</h1>
<p>$description</p>
</section>
<section id="contact">
<h1>Contact</h1>
<p>Contact $title at: $webmaster
</section>
<section id="list">
<h1>The List!</h1>
<p>This is the list of the feeds that we are checking. If you have an RSS reader, grab them and follow along!</p>
$listHTML
</section>
<hr>
<footer>
<p>A footer</p>
<p><a href="https://code.jacobhaddon.com/jake/smhn">Code</a> by Jacob Haddon - license <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPLv3.0</a> - <a href="https://Apokrupha.com">Apokrupha.com</a></p>
</footer>
<hr>
</body>
</html>
HTML_END

# write the file
# The page declares charset utf-8 and the feed descriptions may contain
# non-ASCII characters, so encode on output instead of relying on Perl's
# default byte semantics. close is checked because buffered write errors
# only show up there.
open( my $htmlFH, '>:encoding(UTF-8)', $htmlFilePath )
    or die "Cannot open $htmlFilePath for writing: $!";
print {$htmlFH} $html;
close($htmlFH)
    or die "Cannot finish writing $htmlFilePath: $!";

# print $html_header . $html;
###################################
# Functions
###################################
# formatDate($dateString)
#
# Turn an RSS style date such as "Thu, 28 Dec 2023 03:51:42 +0000" into the
# display form "Thu, 28 Dec 2023 03:51:42". The time zone offset (or a zone
# name such as GMT) is dropped and the clock time is shown as written.
#
# Returns the formatted string, or an empty string when the argument is
# undefined or cannot be parsed as a date.
sub formatDate {
    my ($testDate) = @_;
    return '' unless defined $testDate;

    # Strip the numeric offset, a trailing zone abbreviation, and padding
    $testDate =~ s/((\+|\-)(\d\d\d\d))//;
    $testDate =~ s/\s+[A-Z]{1,5}\s*\z//;
    $testDate =~ s/\A\s+|\s+\z//g;

    # Convert to a TIME object. The format is spelled out because strptime's
    # default ("%a, %d %b %Y") stops after the year and would report every
    # time as 00:00:00; the default is kept as a fallback for date-only input.
    my $t = eval { Time::Piece->strptime( $testDate, '%a, %d %b %Y %H:%M:%S' ) };
    $t = eval { Time::Piece->strptime($testDate) } unless defined $t;
    return '' unless defined $t;

    return $t->strftime('%a, %d %b %Y %H:%M:%S');
}
# FIN
###################################
# DATA is list of the feed URLs
###################################
__DATA__
https://nnw.ranchero.com/feed.json
https://feeds.npr.org/1001/rss.xml