smhn/smhnRSS.pl

407 lines
13 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
# use cPanelUserConfig; #for cpanel servers
use 5.010;
use strict;
use warnings;
###################################
#
# The Sunday Morning Horror News
#
# This script pulls from a list of RSS feeds and aggregates them together into a web page.
# It is designed to run as a cron and overwrite the HTML file.
#
# license GPLv3.0 https://www.gnu.org/licenses/gpl-3.0.en.html
# Code repository: https://code.jacobhaddon.com/jake/smhn
# Written by Jacob Haddon https://jacobhaddon.com
#
###################################
# Packages
use Time::Piece; # https://perldoc.perl.org/Time::Piece
use Time::Seconds; # https://perldoc.perl.org/Time::Seconds
use LWP::Simple; # https://metacpan.org/pod/LWP::Simple
use XML::RSS; # https://metacpan.org/pod/XML::RSS
use HTML::Entities; # https://metacpan.org/pod/HTML::Entities
# server file folders
# my $rssFilePath = "/home/USER_FOLDER/public_html/feed.xml";
# my $htmlFilePath = "/home/USER_FOLDER/public_html/index.html";
# my $errorFilePath = "/home/USER_FOLDER/public_html/feed.log";
# Output locations used when running locally: the generated feed, the
# generated web page, and the log of feeds that could not be read.
my ( $rssFilePath, $htmlFilePath, $errorFilePath )
    = ( "feed.xml", "index.html", "feed.log" );

###################################
# RSS Configurations
###################################

# Channel-level metadata: used for the generated feed's <channel> element
# and interpolated into the HTML page template.
my $title       = "Sunday Morning Horror News";
my $description = 'A collection of horror writer, editor, poet and publisher blogs from all over the web!';
my $copyright   = 'Copyright respective writers';
my $webmaster   = 'webmaster@sundaymorninghorrornews.com';

# Public addresses of the site and of the aggregated feed itself
my $homeLink = "http://sundaymorninghorrornews.com";
my $feedLink = "http://sundaymorninghorrornews.com/feed.xml";
###################################
# Go through list of URLs, get RSS feed,
# take newest 3 that are less than $then old,
# add to new RSS feed object
###################################
# How many weeks back an entry may have been published and still be kept
my $num_weeks = 2;

# Highest item index taken from each feed; indexes start at 0, so a value
# of 2 means "up to three items per feed".
my $number_of_items = 2;

# Current time, and the cut-off: today minus $num_weeks weeks
my $now  = localtime;
my $then = $now - ( ONE_WEEK * $num_weeks );

# Collected feed items, filled in while the DATA section is processed
my %list;

# HTML list of the feed URLs, also built while the DATA section is processed
my $listHTML = "<div class=\'listHTML\'><ul>\n";

# Text of the error log: a heading and a timestamp, followed later by one
# line per feed that could not be used.
my $listURLError = join(
    "\n\n",
    "The following feeds had issues this time:",
    $now->strftime('%a, %d %b %Y %H:%M:%S %z'),
    "",
);
# Go through each URL in the DATA section and make the new list
# Fetch every feed listed under __DATA__, keep its newest entries that are
# newer than $then, tidy their descriptions and store them in %list (keyed
# by publication epoch) for the RSS and HTML output further down.
# Feeds that cannot be fetched or parsed are recorded in $listURLError and
# left out of $listHTML.

# LWP::Simple normally loads this already; requiring it here makes the
# dependency explicit, since the object interface is what is used below.
require LWP::UserAgent;

# One user agent is reused for every request, with a browser-like
# User-Agent header instead of the library default.
my $ua = LWP::UserAgent->new;
$ua->default_header( 'User-Agent' => 'Mozilla/5.0' );

FEED:
while ( my $url = <DATA> ) {
    chomp $url;
    $url =~ s/^\s+|\s+$//g;
    next FEED if $url eq '';    # ignore blank lines in the URL list

    # get() always returns a response object, so success has to be checked
    # on the response itself.
    my $resp = $ua->get($url);
    if ( !$resp->is_success ) {
        $listURLError .= "* " . $url . "\n";
        next FEED;
    }
    my $xml = $resp->decoded_content;

    # parse the XML; a feed that does not parse goes to the error log
    my $rss1 = XML::RSS->new;
    my $parsed = eval { $rss1->parse($xml); 1 };
    if ( !$parsed ) {
        $listURLError .= "* " . $url . "\n";
        next FEED;
    }

    # Only look at items that actually exist: a feed may hold fewer than
    # $number_of_items + 1 entries.
    my @feedItems = @{ $rss1->{'items'} || [] };
    my $lastIndex = $#feedItems < $number_of_items ? $#feedItems : $number_of_items;
    my $badDates  = 0;

    ITEM:
    for my $testItem ( @feedItems[ 0 .. $lastIndex ] ) {

        # Get the pub date of the article
        my $testDate = $testItem->{'pubDate'};
        if ( !defined $testDate ) {
            $badDates++;
            next ITEM;
        }

        # Strip the time zone (numeric offset or upper-case abbreviation)
        # and trailing white space so only date and time remain.
        $testDate =~ s/((\+|\-)(\d\d\d\d))//;
        $testDate =~ s/\s+[A-Z]{1,5}\s*\z//;
        $testDate =~ s/\s+\z//;

        # Convert to a Time::Piece object. The format is given explicitly:
        # without one, strptime() only reads "%a, %d %b %Y" and the time of
        # day is lost. The date-only form is kept as a fallback, and a date
        # that cannot be read at all skips the item instead of aborting.
        my $t = eval { Time::Piece->strptime( $testDate, '%a, %d %b %Y %H:%M:%S' ) }
            // eval { Time::Piece->strptime($testDate) };
        if ( !defined $t ) {
            $badDates++;
            next ITEM;
        }

        # Skip posts published before the cut-off ($num_weeks weeks ago)
        next ITEM unless $t->epoch > $then->epoch;

        $testItem->{'feedURL'}  = $url;
        $testItem->{'feedName'} = $rss1->{'channel'}{'title'};

        # Find Author tags, falling back to the feed title
        if ( $testItem->{'dc'}{'creator'} ) {
            $testItem->{'itemAuthor'} = $testItem->{'dc'}{'creator'};
        }
        elsif ( $testItem->{'author'} ) {
            $testItem->{'itemAuthor'} = $testItem->{'author'};
        }
        else {
            $testItem->{'itemAuthor'} = $rss1->{'channel'}{'title'};
        }

        # Clean up some of the artifacts in the RSS feed 'description'
        # section. The loop variable aliases the hash element, so every
        # substitution edits the stored description in place.
        $testItem->{'description'} //= '';
        for my $text ( $testItem->{'description'} ) {
            $text =~ s/\n\s*/\n/g;                                   # get rid of excess white space
            $text =~ s/(<a(.+?)<\/a>)$//s;                           # link at end of description
            $text =~ s/<(\w) class=(.*?)>/<$1>/s;                    # remove class statements from text
            $text =~ s/<figure ((.|\n)+?)figure>//sg;                # remove "figure" can use pipe to add more
            $text =~ s/<img(.+?)>//sg;                               # remove IMG tags
            $text =~ s/<span ((.|\n)+?)>//sg;                        # remove "span" tags (mostly blogger)
            $text =~ s/<\/span>//sg;                                 # remove "span" endtags
            $text =~ s/<div class="separator" style(.+?)<\/div>//sg; # remove blogger DIV tags
            $text =~ s/<br(.+?)>/<br>/sg;                            # remove blogger BR tags
            $text =~ s/(<div><br><\/div>)+/<br>/sg;                  # remove blogger BR + DIV tags
            $text =~ s/.{1200}\K.*//s;                               # limit length
            $text =~ s/<\/?div.*?>//sg;                              # remove div tags
            $text =~ s/(<a[^<]+)$//s;                                # link cut off at end of description
        }

        # add continue reading to end; the link comes from the feed, so it
        # is escaped before being placed inside the href attribute.
        $testItem->{'description'} .= " <a href=\'" . encode_entities( $testItem->{'link'} // '' ) . "\' target=\'_blank\'>Continue Reading</a>";

        # Set Epoch as KEY so easier to arrange in order. If another item
        # already holds that second, move to the next free one so that no
        # item silently replaces another.
        my $key = $t->epoch;
        $key++ while exists $list{$key};
        $list{$key} = $testItem;
    }    # for ITEM

    # note feeds whose items had to be skipped because of their dates
    if ($badDates) {
        $listURLError .= "* " . $url . " (unreadable pubDate)\n";
    }

    # add to the list of URL HTML
    $listHTML .= "<li>" . $url . "</li>\n";
}    # while DATA

# close out the list URL html
$listHTML .= "</ul></div>\n";
###################################
# Write the error file
###################################
# Overwrite the log file with the list of feeds that had problems this run.
# A lexical handle is used instead of a bareword, and both open and close
# are checked so a failed write does not go unnoticed.
open( my $errorFH, '>', $errorFilePath )
    or die "Cannot open $errorFilePath for writing: $!";
print {$errorFH} $listURLError;
close($errorFH)
    or die "Cannot close $errorFilePath: $!";
###################################
# Make an RSS Feed!
###################################
# date format: Thu, 28 Dec 2023 03:51:42
# $now->strftime("%a, %d %b %Y %H:%M:%S %z");
# Build the aggregated RSS 2.0 feed from the items collected in %list and
# save it to $rssFilePath.
my $rss2 = XML::RSS->new( version => '2.0' );
$rss2->add_module( prefix => 'atom', uri => 'http://www.w3.org/2005/Atom' );

# Channel description; pubDate and lastBuildDate are both the time of this run
$rss2->channel(
    title         => $title,
    link          => $homeLink,
    language      => 'en-US',
    description   => $description,
    copyright     => $copyright,
    pubDate       => $now->strftime('%a, %d %b %Y %H:%M:%S %z'),
    lastBuildDate => $now->strftime('%a, %d %b %Y %H:%M:%S %z'),
    webMaster     => $webmaster,
    atom          => { 'link' => { 'href' => $feedLink, 'rel' => 'self', 'type' => 'application/rss+xml' } },
);    # $rss2->channel

# foreach ITEM, newest (highest EPOCH) first. The keys are epoch numbers,
# so they are compared numerically rather than as strings.
foreach my $name ( sort { $b <=> $a } keys %list ) {
    my $entry = $list{$name};
    $rss2->add_item(
        title       => $entry->{'title'},
        permaLink   => $entry->{'link'},
        link        => $entry->{'link'},
        description => $entry->{'description'},
        pubDate     => $entry->{'pubDate'},
        author      => $entry->{'itemAuthor'},
        source      => $entry->{'feedName'},
        sourceUrl   => $entry->{'feedURL'},
    );    # $rss2->add_item
}    # foreach

# Save the RSS feed as a file
$rss2->save($rssFilePath);
###################################
# Format the RSS data for HTML
###################################
# make the HTML for the processed RSS
# Build the HTML for the aggregated items: one headline, byline and
# description per entry of the new feed.
my $rssHTML = "<div class=\'rssHTML'>\n";

foreach my $item ( @{ $rss2->{'items'} } ) {

    # The link comes from a remote feed; escape it so a quote or ampersand
    # in the URL cannot break out of the href attribute.
    my $href = encode_entities( $item->{'link'} // '' );

    $rssHTML .= "<h2 class=\'headline\'><a href='" . $href . "' target=\'_blank\'>" . encode_entities($item->{'title'}) . "</a></h2>\n\n";
    $rssHTML .= "<div class=\'byline\'>" . encode_entities($item->{'author'}) . " - " . formatDate($item->{'pubDate'}) ."</div>\n\n";

    # The description is inserted as HTML; it was cleaned up when the item
    # was collected.
    $rssHTML .= "<div class=\'description\'>" . $item->{'description'} . "</div>\n\n";
}    # foreach item

# close out the rssHTML
$rssHTML .= "</div>\n\n";

# the webpage HTML
# format the channel pubDate for the "updated:" line of the page
my $printDate = formatDate($rss2->{'channel'}{'pubDate'});

# header for a direct HTML post (only used by the commented-out print at
# the end of the script)
my $html_header = "Status: 200\nContent-type: text/html\n\n";
###################################
# Make the HTML Page
###################################
# The complete web page. This is an interpolating heredoc: $title,
# $description, $printDate, $feedLink, $rssHTML, $webmaster and $listHTML
# are filled in, and the @ of the CSS media query is escaped so Perl does
# not treat it as an array.
my $html = <<"HTML_END";
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>The Sunday Morning Horror News</title>
<style>
body{
margin:40px auto;
max-width:650px;
font: 18px/1.6 Baskerville, TimesNewRoman, Times New Roman, Times, Georgia, serif;
padding:0 10px;
word-break: break-word;
}
h1, h2, h3 {font-weight: 200;}
hr{width:50%;}
.byline{font-style:italic;}
nav{
font-size:20px;
text-align: center;
}
a {text-decoration:none;}
a:hover {text-decoration:underline;}
/* Dark Mode Colors */
\@media screen and (prefers-color-scheme: dark) {
body{
color: white;
background-color: black;
}
a {color:orange;}
a:visited{color:yellow;}
} /* media screen */
</style>
</head>
<body>
<header>
<h1>$title</h1>
<p>$description</p>
<p>This site is in BETA and will update daily as we test things out and add URLs. Let us know if there are sites that should be included!</p>
<p>updated: $printDate</p>
</header>
<nav>
<a href="#news">News</a> - <a href="#about">About</a> - <a href="#contact">Contact</a> - <a href="#list">List</a> - <a href="$feedLink">RSS Feed</a>
</nav>
<section id="news">
<h1>The News!</h1>
$rssHTML
</section>
<section id="about">
<h1>About $title</h1>
<p>The $title is an aggregator of RSS feeds from writers, poets and publishers.</p>
<p>Too often the world is flying by at the speed of social media. $title is designed to update just once a week (on Sunday!) so you can take your time and catch up on the news without having to scroll.</p>
</section>
<section id="contact">
<h1>Contact</h1>
<p>Contact $title at: $webmaster</p>
<p>We are intolerant of intolerance. If you see something hateful, transphobic, homophobic, racist, or the like, let us know.</p>
<p>We don't publish the content you see here, just collect it, but we don't need to collect trash.</p>
</section>
<section id="list">
<h1>The List!</h1>
<p>This is the list of the feeds that we are checking. If you have an RSS reader, grab them and follow along!</p>
$listHTML
</section>
<hr>
<footer>
<p>This news feed made with <a href="https://www.perl.org/">Perl</a> and <a href="https://www.barebones.com/products/bbedit/">BBEdit</a>. Best viewable in Netscape Navigator 4 or higher</p>
<p>Open web features like RSS make sites like this possible. Support the open web!</p>
<p>this is a <a href="http://motherfuckingwebsite.com/">motherfucking website</a> and a <a href="http://bettermotherfuckingwebsite.com/">better motherfucking</a> website.</p>
<p><a href="https://code.jacobhaddon.com/jake/smhn">Code</a> by Jacob Haddon - license <a href="https://www.gnu.org/licenses/gpl-3.0.en.html">GPLv3.0</a> - <a href="https://Apokrupha.com">Apokrupha.com</a></p>
</footer>
<hr>
</body>
</html>
HTML_END
# write the file
# The page declares charset=utf-8 and contains decoded feed text, so the
# handle gets a UTF-8 output layer. A lexical handle replaces the bareword,
# and open and close are both checked.
open( my $htmlFH, '>:encoding(UTF-8)', $htmlFilePath )
    or die "Cannot open $htmlFilePath for writing: $!";
print {$htmlFH} $html;
close($htmlFH)
    or die "Cannot close $htmlFilePath: $!";
# print $html_header . $html;
###################################
# Functions
###################################
sub formatDate {
    # Takes an RSS-style date string such as
    # "Thu, 28 Dec 2023 03:51:42 +0000" and returns it formatted as
    # "Thu, 28 Dec 2023 03:51:42" (no time zone).
    #
    # Argument: the date string.
    # Returns:  the formatted string; the input unchanged when it cannot be
    #           parsed; an empty string when no date was given.
    my ($rawDate) = @_;
    return '' unless defined $rawDate;

    # Strip the time zone (numeric offset or upper-case abbreviation) and
    # trailing white space so only date and time remain.
    my $testDate = $rawDate;
    $testDate =~ s/((\+|\-)(\d\d\d\d))//;
    $testDate =~ s/\s+[A-Z]{1,5}\s*\z//;
    $testDate =~ s/\s+\z//;

    # Convert to a Time::Piece object. The format is passed explicitly:
    # without one, strptime() only reads "%a, %d %b %Y" and every result
    # would show 00:00:00. The date-only form is kept as a fallback.
    my $t = eval { Time::Piece->strptime( $testDate, '%a, %d %b %Y %H:%M:%S' ) }
        // eval { Time::Piece->strptime($testDate) };

    # This is display-only output, so an unreadable date is shown as-is
    # rather than aborting the whole page build.
    return $rawDate unless defined $t;

    return $t->strftime('%a, %d %b %Y %H:%M:%S');
} # sub formatDate
# FIN
###################################
# DATA is list of the feed URLs
###################################
__DATA__
https://ecatherine.com/feed/
https://redlagoe.com/feed/
https://diebooth.wordpress.com/feed/
https://kelliowen.com/feed/
https://mercedesmyardley.com/feed/
https://mehitobel.com/feed/
http://antoncancre.blogspot.com/feeds/posts/default?alt=rss
https://marysangi.wordpress.com/feed/
https://www.toddkeisling.com/news?format=rss
https://www.jfgonzalez.org/news?format=rss
https://www.briankeene.com/news?format=rss
https://www.cemeterydance.com/extras/feed/
https://www.gwendolynkiste.com/Blog/feed/
https://katherinesilvaauthor.substack.com/feed
https://queenofswordspress.com/feed/
http://fiendlover.blogspot.com/feeds/posts/default
http://jacobhaddon.com/feed/
http://apokrupha.com/feed/
https://ellendatlow.com/feed/
https://paulaguran.com/feed/
https://amandaheadlee.com/feed/
https://theimbloglio.wordpress.com/feed/
https://kennethwcain.com/feed/
https://wellwortharead.blogspot.com/feeds/posts/default?alt=rss
http://robertfordauthor.com/feed/
https://tinyfrights.com/feed/
https://weightlessbooks.com/feed/
https://www.crystallakepub.com/feed/
https://lynnehansen.zenfolio.com/blog.rss
https://www.bevvincent.com/feed/
http://liviallewellyn.com/feed/
https://www.kristidemeester.com/blog-feed.xml
https://www.lucysnyder.com/index.php/feed/
https://www.emilyruthverona.com/blog-feed.xml
https://www.elizabethhand.com/welcome?format=rss
https://www.jamielackey.com/feed/
https://cv-hunt.com/feed/
https://authorjenniferallisprovost.com/feed/
https://jezzywolfe.wordpress.com/feed/
https://lmariewood.com/feed/
https://www.leemurray.info/blog-feed.xml
https://meghanarcuri.com/feed/
https://nicolecushing.wordpress.com/feed/
https://saratantlinger.com/feed/
https://sunnymoraine.com/feed/
https://lauramauro.com/feed/