feedparse.lisp
  1 ;;;
2 ;;; This file is a part of the feedparse project.
3 ;;; Copyright (c) 2014 K. Isom (kyle@tyrfingr.is)
4 ;;;
5
6 (in-package :cl-user)
7 (defpackage feedparse
8 (:use :cl)
9 (:export :parse-feed
10 :parse-feed-string
11 :feed :feed-title :feed-kind :feed-link :feed-items
12 :feed-item :item-id :item-title :item-date :item-link :item-body
13 :*http-request-timeout*))
14 (in-package :feedparse)
15
16 (setf s-xml:*ignore-namespaces* t)
17
18 (defvar *http-request-timeout* 60)
19
20 ;;; `read-file-string` is a utility function to load RSS or Atom feeds
21 ;;; from disk. This is mostly useful in testing, or perhaps for
22 ;;; locally-stored feeds.
23 (defun read-file-string (path)
24 (with-open-file (s path)
25 (let ((data (make-string (file-length s))))
26 (read-sequence data s)
27 data)))
28
29 ;;; A predicate that returns true if the string is a byte string.
30 (defun byte-string-p (str)
31 (and (arrayp str)
32 (every #'integerp str)))
33
34 ;;; A feed contains a title, a kind (either `:RSS` or `:ATOM`), a
35 ;;; string containing the link to the feed's source, and a list of
36 ;;; items retrieved from the feed.
37 (defclass feed ()
38 ((title :initarg :title :reader feed-title :type string
39 :documentation "Return the feed's title.")
40 (kind :initarg :kind :reader feed-kind :type keyword
41 :documentation "Return the feed's kind as a keyword (:RSS
42 or :ATOM).")
43 (link :initarg :link :reader feed-link :type string
44 :documentation "Return the link to the feed's source.")
45 (items :initarg :items :reader feed-items :type list
46 :documentation "Return the list of entries in the feed."))
47 (:documentation "feed is a generic container for storing feed
48 items."))
49
50 ;;; Overloading `describe-object` allows a more useful display of a
51 ;;; feed.
52 (defmethod describe-object ((feed feed) stream)
53 (let ((kind (cond ((eql (feed-kind feed) :rss) "RSS")
54 ((eql (feed-kind feed) :atom) "Atom")
55 (:else "Unknown"))))
56 (format stream "~A (~A feed)~%Link: ~A~%Item count: ~A~%~{ Entry: ~A~%~}~%"
57 (feed-title feed) kind
58 (feed-link feed)
59 (length (feed-items feed))
60 (mapcar #'item-title (feed-items feed)))))
61
62 ;;; A feed item contains the entry's title, date, link, and body text.
63 (defclass feed-item ()
64 ((id :initarg :id :reader item-id
65 :documentation "Return the item's ID.")
66 (title :initarg :title :reader item-title
67 :documentation "Return the item's title.")
68 (date :initarg :date :reader item-date
69 :documentation "Return the date the item was published.")
70 (link :initarg :link :reader item-link
71 :documentation "Return the link pointing to the entry.")
72 (body :initarg :body :reader item-body
73 :documentation "Return the body of the feed entry, as stored
74 in the entry's description field."))
75 (:documentation "A feed item stores a single item in a feed."))
76
77 ;;; `describe-object` is also overloaded for a feed item to provide a
78 ;;; more useful display.
79 (defmethod describe-object ((item feed-item) stream)
80 (format stream "Entry: ~A (published ~A)~%Link: ~A~%Description:~%~A~%"
81 (item-title item) (item-date item)
82 (item-link item) (item-body item)))
83
84 ;;; `make-feed-item` is a utility function that builds a feed from the
85 ;;; title, date, link, and body.
86 (defun make-feed-item (id title date link body)
87 (make-instance 'feed-item
88 :id id
89 :title title
90 :date date
91 :link link
92 :body body))
93
94 ;;; `parser-dispatch` loads the appropriate parser for Atom and RSS
95 ;;; feeds.
96 (defun parser-dispatch (feed-xml)
97 (cond
98 ((assoc :|rss| feed-xml) (parse-rss feed-xml))
99 ((assoc '|feed| feed-xml) (parse-atom feed-xml))
100 ((assoc :|feed| feed-xml) (parse-atom feed-xml))
101 (:else (error "Unknown feed type."))))
102
103 ;;; This should really be part of drakma, but for now we use it as a
104 ;;; drop-in wrapper.
105 (define-condition request-timeout-error (error)
106 ((url :initarg :url
107 :reader request-timeout-error-url))
108 (:report (lambda (condition stream)
109 (format stream "Grabbing URL ~a timed out."
110 (request-timeout-error-url condition)))))
111
112 (defun http-request-thread (url timeout)
113 (handler-case
114 (drakma:http-request url :connection-timeout timeout)
115 ;; Note: this will have to be revisited once we have a sane view of
116 ;; all the conditions returned by drakma.
117 (t (c)
118 (return-from http-request-thread (values nil c)))))
119
120 (defun http-request-with-timeout (url timeout)
121 #+:sb-thread
122 (let ((req-thread (sb-thread:make-thread
123 #'http-request-thread
124 :name "feedparse-request-thread"
125 :arguments (list url timeout))))
126 (multiple-value-bind (value result)
127 (sb-thread:join-thread req-thread :default nil :timeout timeout)
128 (when (eq result :timeout)
129 (when (sb-thread:thread-alive-p req-thread)
130 (sb-thread:terminate-thread req-thread))
131 (error 'request-timeout-error :url url))
132 value))
133 #-:sb-thread
134 (error "Unsupported Lisp implementation for HTTP requests with timeout."))
135
136 ;;; `parse-feed` downloads the feed as a byte string and passes it off
137 ;;; to be parsed.
138 (defun parse-feed (url)
139 "Fetch the feed described in URL over HTTP, and parse the
140 feed. Returns a feed object."
141 (parse-feed-string (http-request-with-timeout url *http-request-timeout*)))
142
143 ;;; In order to parse a string containing a feed, it must be parsed as
144 ;;; XML and then handed off to `parser-dispatch`.
145 (defun parse-feed-string (str)
146 "Parse the feed stored in the string or array of octets, returning a
147 feed object."
148 (parser-dispatch
149 (s-xml:parse-xml-string
150 (cond
151 ((stringp str) str)
152 ((byte-string-p str) (flexi-streams:octets-to-string str))
153 (:else (error "Unknown content-type."))))))
154
155 ;;; Just give me the XML. This is useful in SLiME to understand the
156 ;;; structure of the data.
157 (defun parse-xml (str)
158 (s-xml:parse-xml-string
159 (cond
160 ((stringp str) str)
161 ((byte-string-p str) (flexi-streams:octets-to-string str))
162 (:else (error "Unknown content-type.")))))
163
164 ;;; Let's take a look-see at this here XML. Or whatever.
165 (defun pretty-print-xml (url)
166 (let ((str (http-request-with-timeout url 60)))
167 (pprint
168 (s-xml:parse-xml-string
169 (cond
170 ((stringp str) str)
171 ((byte-string-p str) (flexi-streams:octets-to-string str))
172 (:else (error "Unknown content-type.")))))))
173
174 (defun get-xml-element (element item)
175 (second (assoc element item
176 :test #'(lambda (item x)
177 (cond
178 ((symbolp x) (eq x item))
179 ((listp x) (eq item (car x)))
180 (t nil))))))
181
182 (defun parse-rss (feed-xml)
183 (let ((channel (cdr (assoc :|channel| feed-xml))))
184 (make-instance 'feed
185 :title (get-xml-element :|title| channel)
186 :kind :rss
187 :link (get-xml-element :|link| channel)
188 :items (extract-rss-items channel))))
189
190 (defun parse-rss-item (item)
191 (make-feed-item (get-xml-element :|guid| item)
192 (get-xml-element :|title| item)
193 (get-xml-element :|pubDate| item)
194 (get-xml-element :|link| item)
195 (get-xml-element :|description| item)))
196
197 (defun build-rss-item-list (channel)
198 (mapcar #'cdr
199 (remove-if-not (lambda (elt)
200 (and (listp elt)
201 (equal (first elt) :|item|)))
202 channel)))
203
204 (defun extract-rss-items (channel)
205 (mapcar #'parse-rss-item
206 (build-rss-item-list channel)))
207
208
209 (defun build-atom-item-list (feed-xml)
210 (mapcar #'cdr
211 (remove-if-not (lambda (elt)
212 (equal (first elt) :|entry|))
213 feed-xml)))
214
215 (defun atom-entry-p (elt)
216 (let ((tag (first elt)))
217 (cond
218 ((symbolp tag) (equalp tag :|entry|))
219 ((listp tag) (equalp (first tag) :|entry|))
220 (t nil))))
221
222 (defun get-atom-body (entry)
223 (let ((summary (get-xml-element :|summary| entry)))
224 (if (null summary) "" summary)))
225
226 (defmacro head-is-x (fn target)
227 `(defun ,fn (elt)
228 (when (listp elt)
229 (equalp (first elt) ,target))))
230
231 (head-is-x head-is-linkp :|link|)
232 ; (every-x-is-eltp head-is-linkp :|link|)
233
234 (defmacro defmatch-atom-elt (fn pred)
235 `(defun ,fn (elt)
236 (and (listp elt)
237 (listp (first elt))
238 (every ,pred elt))))
239
240 (defmatch-atom-elt match-atom-link #'head-is-linkp)
241
242 (defun extract-atom-entry-title (entry)
243 (labels ((title-candidatep (elt)
244 (when (and (listp elt)
245 (listp (first elt)))
246 (first elt)))
247 (titlep (elt)
248 (equalp :|title| (first (first elt)))))
249 (let ((candidates (remove-if-not #'title-candidatep entry)))
250 (second (first (remove-if-not #'titlep candidates))))))
251
252 (defun get-atom-title (entry)
253 (let ((title (get-xml-element :|title| entry)))
254 (if (null title)
255 (extract-atom-entry-title entry)
256 title)))
257
258 (defun get-atom-link (entry)
259 (let* ((links (remove-if-not #'match-atom-link entry))
260 (alt-links (remove-if-not
261 #'(lambda (link)
262 (let ((link-rel (getf (rest (first link))
263 :|rel|)))
264 (or (not link-rel)
265 (string= link-rel "alternate"))))
266 links)))
267 (when alt-links
268 (getf (rest (caar alt-links)) :|href|))))
269
270 (defun build-atom-entry (entry)
271 (let* ((body (rest entry)))
272 (make-feed-item (get-xml-element :|id| body)
273 (get-atom-title body)
274 (get-xml-element :|published| body)
275 (get-atom-link body)
276 (get-xml-element :|summary| body))))
277
278 (defun extract-first-link (feed-xml)
279 (let ((pos (position-if #'match-atom-link feed-xml)))
280 (unless (null pos)
281 (getf
282 (rest (first (nth pos feed-xml)))
283 :|href|))))
284
285 (defun parse-atom (feed-xml)
286 (make-instance 'feed
287 :title (get-xml-element :|title| feed-xml)
288 :kind :atom
289 :link (extract-first-link feed-xml)
290 :items (mapcar #'build-atom-entry
291 (remove-if-not #'atom-entry-p
292 (cdr feed-xml)))))