REXML で RSS からコンテンツを抜き出して、 URI.extract で URL を抽出するサンプルコード。


#!/usr/bin/env ruby
# coding: utf-8
$KCODE='u'
 
require 'optparse'
require 'net/http'
require 'net/https'
require "rexml/document"
 
Net::HTTP.version_1_2
 
# Parses the command-line options out of ARGV.
#
# Recognizes -f VAL / --feedurl=VAL and stores the value under :feedurl.
# Parsed options are removed from ARGV (parse! is destructive).
#
# Returns a Hash that contains :feedurl only when the option was given.
def get_params()
  parsed = {}
  parser = OptionParser.new
  parser.on('-f VAL', '--feedurl=VAL') do |value|
    parsed[:feedurl] = value
  end
  parser.parse!(ARGV)
  parsed
end
 
# Downloads the RSS feed at feedurl and collects the content of each item.
#
# feedurl - String URL of the feed (http or https).
#
# Issues a GET for the URL, parses the response body as XML and visits every
# element matched by /rss/channel/item.
#
# Returns an Array with one Hash per item; :description holds the result of
# item.text('content:encoded'), which may be nil when the item has no such
# text (per REXML's Element#text).
def get_items(feedurl)
  uri = URI.parse(feedurl)
  http = Net::HTTP.new(uri.host, uri.port)
  # Enable TLS for https:// feeds; previously every URL was spoken to in
  # plain HTTP, so https feeds could not be fetched.
  http.use_ssl = (uri.scheme == 'https')
  # start returns the value of the block, i.e. the collected items.
  http.start do |conn|
    res = conn.get(uri.request_uri)
    doc = REXML::Document.new(res.body)
    doc.get_elements('/rss/channel/item').map do |item|
      { :description => item.text('content:encoded') }
    end
  end
end
 
# Extracts the http and https URLs contained in text.
#
# text - String to scan. nil is accepted and treated as "no text", because a
#        feed item without content yields a nil description.
#
# Returns an Array of the matched URL Strings (empty when there are none).
def get_urls(text)
  # URI.extract cannot scan nil, so answer with an empty list instead of raising.
  return [] if text.nil?
  URI.extract(text, ['http', 'https'])
end
 
# main
# Reads --feedurl from the command line, fetches that feed and prints every
# http/https URL found in the items' content, one per line.

params = get_params()
# Without a feed URL there is nothing to fetch; stop with a usage message
# rather than failing later inside URI.parse(nil).
abort "Usage: #{$0} --feedurl=URL" unless params[:feedurl]
items = get_items(params[:feedurl])

items.each do |item|
  text = item[:description]
  # Items without content text have nothing to scan.
  next if text.nil?
  get_urls(text).each { |url| puts url }
end

実行例。


$ ruby ./extract_urls.rb --feedurl=http://www.nilab.info/nilog/feed.xml | head
http://t.co/fVWMZPQSjr
http://t.co/80vNHnB5K8
http://t.co/nJYmilHP2I
http://t.co/WNlexnBIA6
http://t.co/ZVwLbId8Cq
http://t.co/unsSEW5yjw
http://t.co/4RfluFQKb8
http://t.co/xHYisblBcm
http://t.co/PryqKczAB9
http://t.co/wl8P0NM9KE

参考までに、サンプルに使ったRSSの中身。


$ curl http://www.nilab.info/nilog/feed.xml | head -20
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
  <channel>
    <title>nilog: 無いログは振れない。</title>
    <link>http://www.nilab.info/nilog/</link>
    <description>nilog: 無いログは振れない。</description>
    <language>ja</language>
    <pubDate>Tue, 23 Apr 2013 12:44:32 GMT</pubDate>
    <dc:creator>NI-Lab.</dc:creator>
    <dc:date>2013-04-23T12:44:32Z</dc:date>
    <dc:language>ja</dc:language>
    <item>
      <title>My fitbit #Fitstats for 4/21/2013: 3,133 steps and 2.2 km traveled. http://t.co/fVWMZPQSjr</title>
      <link>http://www.nilab.info/nilog/?type=twitter&amp;id=325988083310919680</link>
      <content:encoded>My fitbit #Fitstats for 4/21/2013: 3,133 steps and 2.2 km traveled. http://t.co/fVWMZPQSjr</content:encoded>
      <pubDate>Sun, 21 Apr 2013 15:03:10 GMT</pubDate>
      <guid>http://www.nilab.info/nilog/?type=twitter&amp;id=325988083310919680</guid>
      <dc:creator>NI-Lab.</dc:creator>
      <dc:date>2013-04-21T15:03:10Z</dc:date>
    </item>

Ref.

tags: ruby url feed

Posted by NI-Lab. (@nilab)