# Copyright 1999 taliesin from shortcuts import * import re, string #--- LATIN-1 Vuml = ('ä', 'ï', 'ë', 'ö', 'ü', 'ÿ') Vgrave = ('à', 'ì', 'è', 'ò', 'ù') Vacute = ('á', 'í', 'é', 'ó', 'ú', 'ý') Vcirc = ('â', 'î', 'ê', 'ô', 'û') Vnas = ('ã', 'õ') Vdia = Vuml + Vgrave + Vacute + Vcirc + Vnas """ max = n counted = list of 1 to n dicts analysed = list of 3 dicts type = 0 to n-1 for counted, -1 for analysed """ class freq: def __init__(self, data, phonemes=[], ignore=(), max=3): assert data if ignore: self.ignore = ignore ignores = re.compile('[' + string.join(ignore, '') + ']') data = ignores.sub('', data) if not phonemes: for i in range(3): phonemes.append(map(None, string.lowercase)) self.phonemes = phonemes for type in range(3): self.phonemes[type].sort(lambda x, y: cmp(len(y), len(x))) self.max = max self.countedsum = [] self.countedstats = [] counted = [] for i in range(self.max): self.countedsum.append(0) self.countedstats.append([]) counted.append({}) counted = self.counter(counted, data) self.counted = counted self.zerofound = [[], [], []] self.analysedsum = [0, 0, 0] self.analysedstats = [[], [], []] analysed = [{}, {}, {}] analysed = self.analyser(analysed, data) self.analysed = analysed self.makestats() def counter(self, counted, data, replace=string.replace): pos = 0 while pos < len(data): if data[pos] != '\000': for type in range(self.max): if pos >= type: entry = data[pos-type:pos+1] if '\000' in entry: break entry = replace(entry, '\000', '') if entry: counted[type][entry] = counted[type].get(entry, 0) + 1 pos = pos + 1 self.counted = counted return counted def analyser(self, analysed, data): assert len(self.phonemes) == 3 type = 0 zerofound = self.zerofound while type < 3: phonemes = self.phonemes[type] rawdata = data[:] for phoneme in phonemes: rawletters = string.split(rawdata, phoneme) if type == 1: phoneme = phoneme[1:] if type == 2: phoneme = phoneme[:-1] rawdata = string.join(rawletters, '') phonemecount = len(rawletters) - 1 if phonemecount: analysed[type][phoneme] = \ analysed[type].get(phoneme, 0) + phonemecount else: zerofound[type].append(phoneme) if not rawdata: break type = type + 1 self.zerofound = zerofound self.analysed = analysed return analysed def makestats(self): def statit(self, dictlist, sums): assert dictlist and sums i = 0 statlist = [] while i < len(sums): dict = dictlist[i] assert dict keys = dict.keys() letters = reduce(inc, dict.values()) stats = [] for key in keys: stats.append(float(dict[key])/letters*100.0, dict[key], key) sums[i] = letters statlist.append(stats) i = i + 1 return statlist, sums stats, sums = statit(self, self.counted, self.countedstats) self.countedstats = stats self.countedsum = sums stats, sums = statit(self, self.analysed, self.analysedstats) self.analysedstats = stats self.analysedsum = sums def printstats(self, list=[], type=0, format='perc', sort='0-9', break_at=65536, ana=0): def printit(list, format="<%3s> %4.1f%%", order=(2,0), break_at=break_at): i = 0 printorder = [] for entry in list: i = i + 1 print format % (entry[order[0]], entry[order[1]]) if i >= break_at: break "list: [(%f, %u, %s)]" assert list list = list[type] print 'Letter frequencies:' if format == 'num': printformat = "%7u <%3s>" list = map(lambda (x, y, z): (y, x, z), list) order = (0, 2) elif format == 'perc': printformat = "<%3s> %4.1f%%" order = (2,0) elif format == 'both': pass if sort == '0-9': list.sort() list.reverse() #print 'Processing:', list printit(list, printformat, order) elif sort == 'a-z': list = map(lambda (x, y, z): (z, x, y), list) list.sort() printit(list, order=(0,1)) if ana: print 'Not found:' for t in self.zerofound[type]: print t,