require 'feedjira'
require 'httparty'
require 'jekyll'
require 'nokogiri'
require 'time'

module ExternalPosts
  # Jekyll generator that adds documents to the `posts` collection for content
  # hosted elsewhere. Sources are read from `site.config['external_sources']`;
  # each source either names an RSS feed (`rss_url`) or lists individual
  # pages (`posts`, each with a `url` and a `published_date`).
  class ExternalPostsGenerator < Jekyll::Generator
    safe true
    priority :high

    # Entry point called by Jekyll. Does nothing when no external sources are
    # configured. A source with neither `rss_url` nor `posts` is announced on
    # stdout and then skipped.
    #
    # @param site [Jekyll::Site] the site being generated
    def generate(site)
      sources = site.config['external_sources']
      return if sources.nil?

      sources.each do |src|
        puts "Fetching external posts from #{src['name']}:"
        if src['rss_url']
          fetch_from_rss(site, src)
        elsif src['posts']
          fetch_from_urls(site, src)
        end
      end
    end

    # Downloads the feed at `src['rss_url']`, parses it with Feedjira and
    # creates one document per feed entry.
    #
    # @param site [Jekyll::Site]
    # @param src [Hash] source configuration; reads 'rss_url' and 'name'
    def fetch_from_rss(site, src)
      xml = HTTParty.get(src['rss_url']).body
      # Nothing to parse when the response carries no body at all.
      return if xml.nil? || xml.empty?

      feed = Feedjira.parse(xml)
      process_entries(site, src, feed.entries)
    end

    # Creates a document for every parsed feed entry.
    #
    # @param site [Jekyll::Site]
    # @param src [Hash] source configuration; reads 'name'
    # @param entries [Enumerable] entries responding to url, title, content,
    #   summary and published
    def process_entries(site, src, entries)
      entries.each do |e|
        puts "...fetching #{e.url}"
        create_document(site, src['name'], e.url, {
          title: e.title,
          content: e.content,
          summary: e.summary,
          published: e.published
        })
      end
    end

    # Builds a Jekyll::Document describing one external post and appends it to
    # the `posts` collection. This method itself writes no file; the path is
    # only used to construct the document.
    #
    # @param site [Jekyll::Site]
    # @param source_name [String] stored as `external_source`
    # @param url [String] stored as `redirect`
    # @param content [Hash] reads :title, :content, :summary and :published
    def create_document(site, source_name, url, content)
      # Slug: lower-cased title, spaces turned into hyphens, anything that is
      # not a word character or hyphen removed. `to_s` keeps an entry without
      # a title from raising NoMethodError on nil.
      slug = content[:title].to_s.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
      path = site.in_source_dir("_posts/#{slug}.md")
      doc = Jekyll::Document.new(
        path, { site: site, collection: site.collections['posts'] }
      )
      doc.data['external_source'] = source_name
      doc.data['title'] = content[:title]
      doc.data['feed_content'] = content[:content]
      doc.data['description'] = content[:summary]
      doc.data['date'] = content[:published]
      doc.data['redirect'] = url
      site.collections['posts'].docs << doc
    end

    # Creates a document for every explicitly listed page of a source. The
    # page is downloaded for its title, description and body; the publication
    # date comes from the configuration entry.
    #
    # @param site [Jekyll::Site]
    # @param src [Hash] source configuration; reads 'posts' and 'name'
    def fetch_from_urls(site, src)
      src['posts'].each do |post|
        puts "...fetching #{post['url']}"
        content = fetch_content_from_url(post['url'])
        content[:published] = parse_published_date(post['published_date'])
        create_document(site, src['name'], post['url'], content)
      end
    end

    # Normalises a configured publication date to a UTC Time.
    #
    # @param published_date [String, Date, Time] value from the configuration
    # @return [Time] the same instant expressed in UTC
    # @raise [RuntimeError] when the value is none of the supported types
    def parse_published_date(published_date)
      case published_date
      when String
        Time.parse(published_date).utc
      when Date
        published_date.to_time.utc
      when Time
        # YAML timestamps with a time of day load as Time rather than Date.
        # getutc returns a new object, leaving the configured value untouched.
        published_date.getutc
      else
        raise "Invalid date format for #{published_date}"
      end
    end

    # Downloads a page and extracts its <title>, meta description and body
    # markup. Any part that is missing is returned as an empty string.
    #
    # @param url [String]
    # @return [Hash] with keys :title, :content and :summary
    def fetch_content_from_url(url)
      html = HTTParty.get(url).body
      parsed_html = Nokogiri::HTML(html)

      title = parsed_html.at('head title')&.text || ''
      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
      body_content = parsed_html.at('body')&.inner_html || ''

      # The published date is not set here; fetch_from_urls adds :published.
      {
        title: title,
        content: body_content,
        summary: description
      }
    end
  end
end