#!/usr/local/bin/python
# -*- coding: utf8 -*-
import sys
sys.path.append('/home/chikara/lib/python')
import cgi
import cgitb
import user
import ghost
import random
import re
from db import Searcher
# Install the cgitb handler so that uncaught exceptions are reported as a
# formatted traceback in the CGI response.
cgitb.enable()
def join(str1, str2, link=''):
	"""Merge two strings, emitting the text they share at the seam only once.

	An overlap is either a suffix of ``str1`` that equals a prefix of
	``str2``, or a suffix of ``str2`` that equals a prefix of ``str1``.
	Longer overlaps are tried before shorter ones; for a given overlap
	length the ``str1``-before-``str2`` order is tried first.

	:param str1: first string.
	:param str2: second string.
	:param link: separator used when the strings do not overlap.
	:return: the merged string, or ``link.join((str1, str2))`` when there is
		no overlap (which includes the case where either string is empty).
	"""
	len1, len2 = len(str1), len(str2)
	# The suffix slices are anchored at the *end* of their string so that
	# overlaps are detected even when the two strings differ in length.
	for size in range(min(len1, len2), 0, -1):
		if str1[len1 - size:] == str2[:size]:
			return str1 + str2[size:]
		if str2[len2 - size:] == str1[:size]:
			return str2 + str1[size:]
	return link.join((str1, str2))

# Matches one markup tag: '<', an optional '/', at least one character that
# is not '>', then the closing '>'.
tag_reg = re.compile(r'</?[^>]+?>')


def trim_tag(text):
	"""Return ``text`` with every tag matched by ``tag_reg`` and every
	newline character removed."""
	without_tags = tag_reg.sub('', text)
	# Splitting on '\n' and re-joining drops all newline characters.
	return ''.join(without_tags.split('\n'))

class SearchedFile:
	"""A single search hit: a file name, a text snippet and a weight.

	Instances are ordered by ``weight`` only, so sorting a list of them
	arranges the hits from the smallest weight to the largest.
	"""

	def __init__(self, fname, string, weight):
		# ``weight`` is public and drives the ordering; the file name and
		# snippet are name-mangled and exposed through get_content().
		self.weight = weight
		self.__string = string
		self.__fname = fname

	def get_content(self):
		"""Return the ``(fname, string)`` pair given to the constructor."""
		return (self.__fname, self.__string)

	def __cmp__(self, other):
		"""Three-way comparison of the two weights (Python 2 ordering hook)."""
		mine, theirs = self.weight, other.weight
		return cmp(mine, theirs)

import math
def search_substr(string, keys, option):
	"""Locate ``keys`` inside ``string`` and return ``(snippet, weight)``.

	The weight is ``min_gap * log(max_gap) * 5 * penalty``, where the gaps
	are the distances between the first occurrences of consecutive found
	keywords and ``penalty`` is 100 per keyword that was not found (1 when
	every keyword was found).  With fewer than two found keywords both gaps
	default to 1, which yields a weight of 0.

	:param string: text to search in.
	:param keys: list of keywords; only the first occurrence of each is used.
	:param option: search mode; ``'and'`` requires every keyword to occur.
	:return: ``(snippet, weight)``.  The snippet starts at the first found
		keyword.  It ends where the last found keyword begins when that span
		is longer than 20 and at most 30 characters; otherwise it is the 30
		characters following the start.
	:raises Exception: ``option`` is ``'and'`` and some keyword is missing.
	:raises ValueError: none of the keywords occurs in ``string``.
	"""
	# Offset of the first occurrence of each keyword (-1 when absent).
	positions = [string.find(k) for k in keys]
	if option == 'and' and -1 in positions:
		raise Exception(option)

	# Penalise the weight by the number of keywords that were not found.
	penalty = positions.count(-1) * 100
	if penalty == 0:
		penalty = 1

	found = sorted(p for p in positions if p != -1)
	if not found:
		# Nothing matched, so there is no snippet to cut.
		raise ValueError('none of the keywords occurs in the string')

	# Distances between neighbouring keyword positions.
	gaps = [b - a for a, b in zip(found, found[1:])]
	if gaps:
		mini, maxi = min(gaps), max(gaps)
	else:
		mini = maxi = 1

	if maxi > 0:
		weight = mini * math.log(maxi) * 5 * penalty
	else:
		# Every found keyword starts at the same offset.  log(0) is
		# undefined, and the smallest gap is 0 as well, so the weight is 0.
		weight = 0.0

	start, end = found[0], found[-1]
	if 20 < end - start <= 30:
		snippet = string[start:end]
	else:
		snippet = string[start:start + 30]
	return snippet, weight


def search(key, option):
	"""Search the 'archive' index and return ranked ``(path, snippet)`` pairs.

	``Searcher('archive').search(key, option)`` supplies the candidate file
	names.  For each one, the '.txt' part of the name is replaced by '.html',
	that file is read, lower-cased and stripped of tags, and
	``search_substr`` produces a snippet and a weight for it.  Candidates for
	which ``search_substr`` raises are left out.

	:param key: query string; keywords are separated by single spaces.
	:param option: search mode, passed through to ``Searcher.search`` and
		``search_substr``.
	:return: list of ``(path, snippet)`` tuples sorted by ascending weight;
		an empty list when the index returns nothing.
	"""
	files = Searcher('archive').search(key, option)
	if not files:
		return []

	# Lower-case the keywords once rather than once per file.
	keys = [s.lower() for s in key.split(' ')]

	result = []
	for name in files:
		path = name.replace('.txt', '.html')
		# Close the handle explicitly instead of leaving it to the garbage
		# collector.
		handle = open(path)
		try:
			raw = handle.read()
		finally:
			handle.close()
		text = trim_tag(unicode(raw.lower()))
		try:
			sub, weight = search_substr(text, keys, option)
		except Exception:
			# search_substr raises when a file does not qualify (for
			# example an 'and' search with a missing keyword): skip it.
			continue
		result.append(SearchedFile(path, sub, weight))

	# SearchedFile instances compare by weight.
	result.sort()
	return [r.get_content() for r in result]

def main():
	"""CGI entry point: run the requested search and print the result page.

	Reads the form fields ``q`` (the query; full-width spaces are converted
	to ASCII spaces and the text is lower-cased) and ``r`` (passed to
	``search`` as its option).  The template 'search.ghst' is rendered with:

	* ``query``  - the normalised query, or '' when no search was run;
	* ``result`` - list of dicts with 'title', 'url' and 'text' keys;
	* ``s``      - True when the search produced results, None when no
	  search was run, and absent when a search produced nothing;
	* ``n``      - True only when a search produced nothing.
	"""
	form = cgi.FieldStorage()
	item = {'result': [], 's': None, 'query': ''}
	try:
		q = unicode(form['q'].value.replace(u'　', ' ')).lower()
		r = form['r'].value
	except Exception:
		# Missing or unusable fields mean "no search requested": render the
		# page without results.  Catching Exception rather than using a bare
		# except lets SystemExit and KeyboardInterrupt propagate.
		pass
	else:
		item['query'] = q
		item['s'] = True
		for path, text in search(q, r):
			url = path.replace('.html', '')
			item['result'].append({'title': url.replace('/', '-'), 'url': url, 'text': text})
		if not item['result']:
			del item['s']
			item['n'] = True
	# The trailing '\n' plus the newline added by print yields the blank line
	# that ends the header section.  A parenthesised single argument prints
	# the same text under the Python 2 print statement.
	print('Content-type: text/html;charset=utf-8\n')
	print(ghost.parse('search.ghst', item))

# Handle the request only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
	main()
