Diary?::2005-09-19

今日も現実逃避の一環として細かいスクリプトを作ってみた。名付けてNotfounder。やってることといえば、ページ内のデッドリンクなどを探すだけ。

#!/usr/bin/env python
from urllib2 import urlopen, HTTPError
from urlparse import urlparse, urljoin
from htmllib import HTMLParser
from formatter import AbstractFormatter, NullWriter

# Module-level HTML parser.  Its formatter writes to a NullWriter, so any
# rendered text is discarded; htmllib's HTMLParser records the link targets
# it encounters in its `anchorlist` attribute.
Parser = HTMLParser(AbstractFormatter(NullWriter()))

def notfounder(url, no_inner=False):
	print "Connecting",url,"..."
	res = urlopen(url)
	_, host, _, _, _, _ = urlparse(url)
	if res.code != 200: raise Exception("%s %s %s" % (url, str(res.code), res.msg))
	Parser.feed(res.read())
	for i in Parser.anchorlist:
		if "http://" != i[:7]:
			i = urljoin(url, i)
		if no_inner == True and host in i:
			continue
		try:
			r = urlopen(i)
		except HTTPError, e:
			yield i, e.code, e.msg
		else:
			yield i, r.code, r.msg
	
import sys
from getopt import getopt
def main():
	opts, args = getopt(sys.argv[1:], "n")
	inner = True
	for o, v in opts:
		if o == "-n":
			innder = False
	for i in args:
		try:
			res = notfounder(i, inner)
		except Exception, e:
			print e
		else:
			try:
				for a, b, c in res:
					print a, b, c
			except Exception, e:
				print e
	
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
	main()

使い方はコマンドラインから

notfounder [-n] URL...

-nオプションを指定した場合、引数として与えたURLと同じドメイン内のページは探索しないようになる。

返ってくるエラーの種類によっては途中で止まるだろうけど、まぁ気にしない方向で。

上記のスクリプトの名前、Notfounder以外にもDeadLinkFounderとかFounderOfNotFoundとか
思い付いたけど、結局一番短いNotfounderに決定。一番言葉で遊んでる感じがするしね。