Enable specifying explicit list of external posts to display (#2059)

- updates the `external-posts.rb` plugin, allowing the user to specify an explicit list of URLs in `_config.yml` that are then displayed in the blog feed as external posts - 99% of the code in this change was written by GPT-4: https://chat.openai.com/share/24432d24-36a7-4d6f-a5c0-d7e5142f68cd
2024-05-27 20:15:44 -04:00 · 2024-05-27 20:15:44 -04:00 · d004837e60
parent 1274581702
commit d004837e60
2 changed files with 83 additions and 19 deletions
--- a/_config.yml
+++ b/_config.yml
@ -133,7 +133,7 @@ bing_site_verification: # out your bing-site-verification ID (Bing Webmaster)
 blog_name: al-folio # blog_name will be displayed in your blog page
 blog_description: a simple whitespace theme for academics
 permalink: /blog/:year/:title/
-lsi: true # produce an index for related posts
+lsi: false # produce an index for related posts

 # Pagination
 pagination:
@ -168,6 +168,10 @@ disqus_shortname: al-folio # put your disqus shortname
 external_sources:
  - name: medium.com
    rss_url: https://medium.com/@al-folio/feed
+  - name: Google Blog
+    posts:
+      - url: https://blog.google/technology/ai/google-gemini-update-flash-ai-assistant-io-2024/
+        published_date: 2024-05-14

 # -----------------------------------------------------------------------------
 # Collections
--- a/_plugins/external-posts.rb
+++ b/_plugins/external-posts.rb
@ -1,6 +1,8 @@
 require 'feedjira'
 require 'httparty'
 require 'jekyll'
+require 'nokogiri'
+require 'time'

 module ExternalPosts
  class ExternalPostsGenerator < Jekyll::Generator
@ -10,27 +12,85 @@ module ExternalPosts
    def generate(site)
+      # Walks every entry under `external_sources` in the site config and pulls
+      # in its posts: an entry with an `rss_url` is read as a feed, otherwise an
+      # entry with a `posts` list is fetched URL by URL. `rss_url` wins when both
+      # keys are present; an entry with neither only prints the banner line.
      if site.config['external_sources'] != nil
        site.config['external_sources'].each do |src|
-          p "Fetching external posts from #{src['name']}:"
-          xml = HTTParty.get(src['rss_url']).body
-          return if xml.nil?
-          feed = Feedjira.parse(xml)
-          feed.entries.each do |e|
-            p "...fetching #{e.url}"
-            slug = e.title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
-            path = site.in_source_dir("_posts/#{slug}.md")
-            doc = Jekyll::Document.new(
-              path, { :site => site, :collection => site.collections['posts'] }
-            )
-            doc.data['external_source'] = src['name'];
-            doc.data['feed_content'] = e.content;
-            doc.data['title'] = "#{e.title}";
-            doc.data['description'] = e.summary;
-            doc.data['date'] = e.published;
-            doc.data['redirect'] = e.url;
-            site.collections['posts'].docs << doc
+          puts "Fetching external posts from #{src['name']}:"
+          if src['rss_url']
+            fetch_from_rss(site, src)
+          elsif src['posts']
+            fetch_from_urls(site, src)
          end
        end
      end
    end
+
+    # Downloads the feed at src['rss_url'], parses it with Feedjira and hands
+    # all of its entries to process_entries.
+    #
+    # site - passed through unchanged to process_entries.
+    # src  - one `external_sources` entry; only 'rss_url' is read here.
+    def fetch_from_rss(site, src)
+      xml = HTTParty.get(src['rss_url']).body
+      # Skip this source (not the whole generator run) when there is no body.
+      return if xml.nil?
+      feed = Feedjira.parse(xml)
+      process_entries(site, src, feed.entries)
+    end
+
+    # Creates one post document per feed entry.
+    #
+    # site    - passed through to create_document.
+    # src     - one `external_sources` entry; its 'name' labels the documents.
+    # entries - objects responding to url, title, content, summary and
+    #           published (here: the entries of a parsed Feedjira feed).
+    def process_entries(site, src, entries)
+      entries.each do |e|
+        puts "...fetching #{e.url}"
+        create_document(site, src['name'], e.url, {
+          title: e.title,
+          content: e.content,
+          summary: e.summary,
+          published: e.published
+        })
+      end
+    end
+
+    # Builds a Jekyll::Document for one external post and appends it to the
+    # site's 'posts' collection.
+    #
+    # site        - provides in_source_dir and collections['posts'].
+    # source_name - stored as the document's 'external_source'.
+    # url         - stored as the document's 'redirect'.
+    # content     - Hash with :title, :content, :summary and :published.
+    #
+    # NOTE(review): an empty title yields the path "_posts/.md" (and
+    # fetch_content_from_url falls back to '' for the title); a nil title would
+    # raise NoMethodError on `downcase` - confirm whether callers can pass one.
+    def create_document(site, source_name, url, content)
+      # Slug: lowercased, stripped title with spaces turned into hyphens and
+      # every character other than word characters and '-' removed.
+      slug = content[:title].downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
+      path = site.in_source_dir("_posts/#{slug}.md")
+      doc = Jekyll::Document.new(
+        path, { :site => site, :collection => site.collections['posts'] }
+      )
+      doc.data['external_source'] = source_name
+      doc.data['title'] = content[:title]
+      doc.data['feed_content'] = content[:content]
+      doc.data['description'] = content[:summary]
+      doc.data['date'] = content[:published]
+      doc.data['redirect'] = url
+      site.collections['posts'].docs << doc
+    end
+
+    # Creates one post document per explicitly listed URL.
+    #
+    # site - passed through to create_document.
+    # src  - one `external_sources` entry; reads 'name' and 'posts', where each
+    #        post is a Hash with 'url' and 'published_date'.
+    #
+    # The page itself supplies title, content and summary; the publication date
+    # always comes from the config entry. A post without a usable
+    # 'published_date' makes parse_published_date raise.
+    def fetch_from_urls(site, src)
+      src['posts'].each do |post|
+        puts "...fetching #{post['url']}"
+        content = fetch_content_from_url(post['url'])
+        content[:published] = parse_published_date(post['published_date'])
+        create_document(site, src['name'], post['url'], content)
+      end
+    end
+
+    # Normalizes a `published_date` value from the config into a UTC Time.
+    #
+    # published_date - a String (parsed with Time.parse), a Date, or a Time.
+    #                  YAML loaders may return a Time instead of a Date when
+    #                  the scalar includes a time of day, so Time is accepted
+    #                  rather than rejected as an invalid format.
+    #
+    # Returns a Time in UTC.
+    # Raises RuntimeError for any other type (including nil), and whatever
+    # Time.parse raises for an unparsable String.
+    #
+    # NOTE(review): a Date or a zone-less String is interpreted at local
+    # midnight before the UTC conversion, so the UTC calendar day can differ
+    # from the configured one depending on the build machine's zone - confirm
+    # this is acceptable.
+    def parse_published_date(published_date)
+      case published_date
+      when String
+        Time.parse(published_date).utc
+      when Time
+        # getutc returns a converted copy; the object held by the config is
+        # left untouched.
+        published_date.getutc
+      when Date
+        published_date.to_time.utc
+      else
+        raise "Invalid date format for #{published_date}"
+      end
+    end
+
+    # Downloads a page and extracts the pieces used to build a post document.
+    #
+    # url - address of the page to fetch.
+    #
+    # Returns a Hash with :title (text of <head><title>), :content (inner HTML
+    # of <body>) and :summary (content attribute of the "description" meta
+    # tag). Each value falls back to '' when the element is missing.
+    #
+    # NOTE(review): the HTTP status is not inspected here, so an error page
+    # would presumably be parsed like any other response - confirm.
+    def fetch_content_from_url(url)
+      html = HTTParty.get(url).body
+      parsed_html = Nokogiri::HTML(html)
+
+      title = parsed_html.at('head title')&.text || ''
+      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
+      body_content = parsed_html.at('body')&.inner_html || ''
+
+      {
+        title: title,
+        content: body_content,
+        summary: description
+        # The published date is not set here; fetch_from_urls adds it.
+      }
+    end
+
  end
 end