QQWry.Dat 不是普通的文本文件,而是一种二进制格式的数据库,因此需要专门的程序才能用 Python 解析。下面是一个用于解析它的 Python 类:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# author : firefoxbug
# E-Mail : wanghuafire@gmail.com
# Blog : www.firefoxbug.net
import sys
import socket
from struct import pack, unpack
class IPInfo(object):
    """Query helper for a QQWry.Dat IP-location database.

    The whole file is loaded into memory as the byte string ``self.img``
    and every method works on byte offsets into that buffer. Multi-byte
    integers are decoded as little-endian. The code runs unchanged on
    Python 2.6+ and Python 3.
    """

    def __init__(self, dbname):
        """Load the database file ``dbname`` and parse its 8-byte header.

        Raises:
            IOError/OSError: the file cannot be opened or read.
            struct.error: the file is shorter than the 8-byte header.
        """
        self.dbname = dbname
        # Binary mode is required: text mode rewrites line endings on
        # Windows and makes Python 3 try to decode the bytes as text.
        with open(dbname, 'rb') as f:
            self.img = f.read()
        # '<' pins little-endian, unpadded 4-byte fields on every host.
        (self.firstIndex, self.lastIndex) = unpack('<II', self.img[:8])
        # Each index entry is 7 bytes: 4-byte start IP + 3-byte offset.
        self.indexCount = (self.lastIndex - self.firstIndex) // 7 + 1

    def _byteAt(self, offset):
        """Return the byte at ``offset`` as an int on Python 2 and 3."""
        value = self.img[offset]
        # Indexing bytes yields an int on Python 3, a 1-char str on Python 2.
        return value if isinstance(value, int) else ord(value)

    def getString(self, offset=0):
        """Return the NUL-terminated string starting at ``offset``.

        The bytes are decoded as GBK (a superset of GB2312). The result is
        a native ``str``: UTF-8 encoded bytes on Python 2, text on
        Python 3. If the bytes cannot be decoded, '未知' is returned.
        """
        end = self.img.find(b'\0', offset)
        if end < 0:
            # No terminator: read to the end of the buffer instead of
            # silently dropping the last byte via a -1 slice bound.
            end = len(self.img)
        raw = self.img[offset:end]
        try:
            text = raw.decode('gbk')
        except UnicodeDecodeError:
            return '未知'
        if str is bytes:
            # Python 2: keep returning a UTF-8 encoded byte string.
            return text.encode('utf-8')
        return text

    def getLong3(self, offset=0):
        """Return the 3-byte little-endian unsigned integer at ``offset``."""
        # Pad with a zero high byte so it can be unpacked as a 4-byte int.
        return unpack('<I', self.img[offset:offset + 3] + b'\0')[0]

    def getAreaAddr(self, offset=0):
        """Return the area string stored at, or redirected from, ``offset``.

        A leading byte of 1 or 2 marks a redirect whose 3-byte target
        offset follows; any other byte starts an inline string. A redirect
        to offset 0 is treated as "no area" and yields an empty string.
        """
        mode = self._byteAt(offset)
        if mode == 1 or mode == 2:
            target = self.getLong3(offset + 1)
            if target == 0:
                # Offset 0 is the file header, not a string.
                return ''
            return self.getAreaAddr(target)
        return self.getString(offset)

    def getAddr(self, offset, ip=0):
        """Return ``(country, area)`` for the location data at ``offset``.

        ``offset`` points at the mode byte of a record, i.e. just past its
        4-byte IP field. ``ip`` is unused and kept only for compatibility.

        Mode 1: the whole (country, area) pair lives at the redirect target.
        Mode 2: only the country is redirected; the area follows the
                4-byte redirect in place.
        Otherwise: the country is an inline string followed by the area.
        """
        mode = self._byteAt(offset)
        if mode == 1:
            return self.getAddr(self.getLong3(offset + 1))
        if mode == 2:
            cArea = self.getAreaAddr(self.getLong3(offset + 1))
            aArea = self.getAreaAddr(offset + 4)
            return (cArea, aArea)
        cArea = self.getString(offset)
        area_offset = self.img.find(b'\0', offset) + 1
        # The area after an inline country may itself be a redirect, so it
        # must go through getAreaAddr rather than being read as a string.
        aArea = self.getAreaAddr(area_offset)
        return (cArea, aArea)

    def find(self, ip, l, r):
        """Binary-search the index for the entry whose range contains ``ip``.

        ``ip`` is the address as a host-order integer. ``l`` is an index
        entry known to start at or before ``ip``; ``r`` is an exclusive
        upper bound (it may be one past the last entry). Returns the
        largest entry number in ``[l, r)`` whose start IP is <= ``ip``.
        """
        while r - l > 1:
            m = (l + r) // 2
            o = self.firstIndex + m * 7
            start_ip = unpack('<I', self.img[o:o + 4])[0]
            # Strict '<': an address equal to a start IP belongs to that
            # entry, not to the one before it.
            if ip < start_ip:
                r = m
            else:
                l = m
        return l

    def getIPAddr(self, ip):
        """Return ``(country, area)`` for the dotted-quad string ``ip``.

        Raises whatever ``socket.inet_aton`` raises for a malformed address.
        """
        ip = unpack('!I', socket.inet_aton(ip))[0]
        # Exclusive upper bound so the last index entry is reachable.
        i = self.find(ip, 0, self.indexCount)
        o = self.firstIndex + i * 7
        o2 = self.getLong3(o + 4)
        # Skip the 4-byte IP field at the start of the record.
        return self.getAddr(o2 + 4)

    def output(self, first, last):
        """Print index entries ``first`` .. ``last - 1``, one per line.

        Each line is "<start ip> <record offset> <country>/<area>".
        """
        for i in range(first, last):
            o = self.firstIndex + i * 7
            start_ip = unpack('<I', self.img[o:o + 4])[0]
            ip = socket.inet_ntoa(pack('!I', start_ip))
            offset = self.getLong3(o + 4)
            (c, a) = self.getAddr(offset + 4)
            print("%s %d %s/%s" % (ip, offset, c, a))
def search_ip(ip_instance, ipaddr):
    """Look up ``ipaddr`` and print its location as "country/area".

    Args:
        ip_instance: object providing ``getIPAddr(ip)`` that returns a
            ``(country, area)`` pair.
        ipaddr: dotted-quad IP address string; a falsy value is ignored
            and nothing is printed.

    Any ordinary error during the lookup or while printing is reported by
    printing 'Unknow/Unknow' instead of raising. KeyboardInterrupt and
    SystemExit are not intercepted.
    """
    if not ipaddr:
        return
    try:
        (c, a) = ip_instance.getIPAddr(ipaddr)
        # Parenthesised single argument: valid on Python 2 and Python 3.
        print('%s/%s' % (c, a))
    except Exception:
        # Best-effort lookup: fall back to a placeholder line.
        print('Unknow/Unknow')
if __name__ == '__main__':
    # Demo entry point: load the database from the current directory and
    # print the location of one sample address.
    search_ip(IPInfo('./QQWry.Dat'), "8.8.8.8")
纯真库的优势是可以精确到公司、街道、网吧;但麻烦的一点是统计起来不方便,因为位置信息都是一整个字符串,需要再做分词。