#!/usr/bin/python -Wall

# ================================================================
# John Kerl
# kerl.john.r@gmail.com
# 2009-07-15
# Please see http://math.arizona.edu/~kerl/randspell/randspell.html
# for more information.
# ================================================================

from __future__ import division # 11/4 = 2.75, not 2.
import sys
import re
import copy
import random
import pickle

# ----------------------------------------------------------------
def find_word_in_line(orig_line):
    line = copy.copy(orig_line)

    # Strip trailing carriage return, if any.
    if line[-1] == '\n':
        line = line[0:-1]

    # Strip comments.
    line = re.sub(r"#.*", r"", line)

    # Strip leading and trailing whitespace.
    line = re.sub(r"^\s+", r"", line)
    line = re.sub(r"\s+$", r"", line)

    # Skip blank lines.
    if re.match(r"^$", line):
        return [0, ""]

    # Skip words with any upper-case letters.
    #if re.match(r"[A-Z]", line):
    #    return [0, ""]

    return [1, line]

# ----------------------------------------------------------------
# Increments the count by one if the key is present; otherwise inserts it
# with count 1.
def hash_count_increment(hash, key):
    if key in hash:
        hash[key] += 1
    else:
        hash[key] = 1

# ----------------------------------------------------------------
# Converts a hash of probabilities into an array of pairs of
# [value, cumulative probability].
def hash_to_CDF(prb_hash):
    CDF = []
    C = 0.0
    for key in prb_hash:
        P = prb_hash[key]
        C += P
        CDF.append([key, C])
    CDF[-1][1] = 1.0 # Fix up roundoff error.
    return CDF

# ----------------------------------------------------------------
def random_selection_from_CDF(CDF):
    U = random.uniform(0.0, 1.0)
    for [value, C] in CDF:
        if U < C:
            return value
    print >> sys.stderr, 'random_selection_from_CDF b0rk'
    sys.exit(1)

# ----------------------------------------------------------------
# Note: letters in words are indexed from zero up.  E.g. 'e' is letter 4
# of the word 'apple'.

# ----------------------------------------------------------------
def populate_ell_and_M_histos(word_list_file_name):
    ells_hash = {}
    M_hashes  = {}

    try:
        fp = open(word_list_file_name, 'r')
    except:
        print >> sys.stderr, "Couldn't open \"%s\" for read.\n" \
            % (word_list_file_name)
        sys.exit(1)

    num_words = 0
    max_word_len = 0
    while 1:
        line = fp.readline()
        if line == "":
            break
        [is_word, word] = find_word_in_line(line)
        if not is_word:
            continue
        num_words += 1

        ell = len(word)
        if ell > max_word_len:
            max_word_len = ell
        if (num_words % 10000) == 0:
            print '... %7d words' % (num_words)

        # Update the histogram of word lengths.
        hash_count_increment(ells_hash, ell)

        # Update the histogram of transition probabilities.
        # If this is the first word of length ell, set M_hashes[ell]
        # to be an array from 0 to ell-1 of empty hashes.
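        # Illustration (hypothetical word, not from any particular word list):
        # after processing only the 3-letter word "cat", M_hashes[3] would hold
        #   [{'c': 1}, {'c': {'a': 1}}, {'c': {'a': {'t': 1}}}].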
        if ell not in M_hashes:
            entry = []
            for k in range(0, ell):
                entry.append({})
            M_hashes[ell] = entry

        # Histogram for letter 0
        letter_0 = word[0]
        hash_count_increment(M_hashes[ell][0], letter_0)

        # Histogram for letter 0 to letter 1
        if ell < 2:
            continue
        letter_1 = word[1]
        if letter_0 not in M_hashes[ell][1]:
            M_hashes[ell][1][letter_0] = {}
        hash_count_increment(M_hashes[ell][1][letter_0], letter_1)

        # Transition histogram for letters k-2,k-1 to k
        if ell < 3:
            continue
        for k in range(2, ell):
            letter_i = word[k-2]
            letter_j = word[k-1]
            letter_k = word[k]
            if letter_i not in M_hashes[ell][k]:
                M_hashes[ell][k][letter_i] = {}
            if letter_j not in M_hashes[ell][k][letter_i]:
                M_hashes[ell][k][letter_i][letter_j] = {}
            hash_count_increment(M_hashes[ell][k][letter_i][letter_j], letter_k)

    fp.close()
    return [ells_hash, M_hashes, num_words, max_word_len]

# ----------------------------------------------------------------
def ell_and_M_histos_to_prbs(ells_hash, M_hashes, num_words, max_word_len):
    # Convert histograms to probabilities.
    for key in ells_hash:
        ells_hash[key] /= num_words

    for ell in range(1, max_word_len+1):
        if ell not in M_hashes:
            continue # No words of this length were encountered.

        # Scale the histogram of letter-0 counts down to probabilities.
        count_0 = 0
        for letter_0 in M_hashes[ell][0]:
            count_0 += M_hashes[ell][0][letter_0]
        for letter_0 in M_hashes[ell][0]:
            M_hashes[ell][0][letter_0] /= count_0

        # Scale the histogram of letter-0-to-letter-1 transitions down to
        # probabilities.
        if ell < 2:
            continue
        for letter_0 in M_hashes[ell][1]:
            count_01 = 0
            for letter_1 in M_hashes[ell][1][letter_0]:
                count_01 += M_hashes[ell][1][letter_0][letter_1]
            for letter_1 in M_hashes[ell][1][letter_0]:
                M_hashes[ell][1][letter_0][letter_1] /= count_01

        # For each subsequent letter, scale the transition histogram down to
        # transition probabilities.
        if ell < 3:
            continue
        for k in range(2, ell):
            for letter_i in M_hashes[ell][k]:
                for letter_j in M_hashes[ell][k][letter_i]:
                    count_ijk = 0
                    for letter_k in M_hashes[ell][k][letter_i][letter_j]:
                        count_ijk += M_hashes[ell][k][letter_i][letter_j][letter_k]
                    for letter_k in M_hashes[ell][k][letter_i][letter_j]:
                        M_hashes[ell][k][letter_i][letter_j][letter_k] /= count_ijk

# ----------------------------------------------------------------
def hashes_to_CDFs(ells_hash, M_hashes, num_words, max_word_len):
    # These arrays are indexed by word length.  Zero-length words have
    # probability 0.
    ell_CDF = [0.0]
    M_CDFs  = [0] * (max_word_len + 1)

    # Convert the ells hash to a CDF.
    ell_CDF = hash_to_CDF(ells_hash)

    # Convert the Ms hashes to CDFs.
    # Loop over word lengths.
    for ell in range(1, max_word_len+1):
        if ell not in M_hashes:
            continue # No words of this length were encountered.

        # M_CDFs[ell][0]                    [idx] = [letter, C]
        # M_CDFs[ell][1][letter_0]          [idx] = [letter, C]
        # M_CDFs[ell][k][letter_i][letter_j][idx] = [letter, C]
        M_CDFs[ell] = [0] * ell

        # Make a CDF for letter 0.
        M_CDFs[ell][0] = hash_to_CDF(M_hashes[ell][0])

        # Make CDFs for letter-0-to-letter-1 transitions, hashed on letter 0.
        if ell < 2:
            continue
        M_CDFs[ell][1] = {}
        for letter_0 in M_hashes[ell][1]:
            M_CDFs[ell][1][letter_0] = hash_to_CDF(M_hashes[ell][1][letter_0])

        # For each letter occurring at subsequent position k, make a CDF for
        # letters k-2,k-1 -> letter k.
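        # Illustration (made-up values): a transition-probability hash such as
        # {'e': 0.75, 'i': 0.25} becomes a CDF like [['e', 0.75], ['i', 1.0]],
        # with key order following dict iteration order.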
        if ell < 3:
            continue
        for k in range(2, ell):
            M_CDFs[ell][k] = {}
            for letter_i in M_hashes[ell][k]:
                M_CDFs[ell][k][letter_i] = {}
                for letter_j in M_hashes[ell][k][letter_i]:
                    M_CDFs[ell][k][letter_i][letter_j] = \
                        hash_to_CDF(M_hashes[ell][k][letter_i][letter_j])

    return [ell_CDF, M_CDFs]

# ----------------------------------------------------------------
def make_random_word(ell_CDF, M_CDFs):
    word = ""

    # Choose the word length.
    ell = random_selection_from_CDF(ell_CDF)

    # Choose letter 0.
    letter_0 = random_selection_from_CDF(M_CDFs[ell][0])
    word += letter_0
    if ell < 2:
        return word

    # Choose letter 1.
    letter_1 = random_selection_from_CDF(M_CDFs[ell][1][letter_0])
    word += letter_1
    if ell < 3:
        return word

    # Choose subsequent letters.
    letter_i = letter_0
    letter_j = letter_1
    for k in range(2, ell):
        letter_k = random_selection_from_CDF(M_CDFs[ell][k][letter_i][letter_j])
        word += letter_k
        letter_i = letter_j
        letter_j = letter_k

    return word

# ================================================================
# ells_hash: hashes word length ell to the probability that a word has
#   length ell.
# M_hashes:  hashes word length ell to an array of length ell:
#   Entry 0 is a hash of probabilities for letter 0.
#   Entry 1 is a hash, keyed on letter 0, of probability hashes for letter 1.
#   Entries 2 to ell-1 are hashes, keyed on letters k-2 and k-1, of
#   probability hashes for letter k.

word_list_file_name = 'wordlists/input-english-gsl-2000.txt'
noutput = 20
print_extras = 0

if len(sys.argv) == 2:
    word_list_file_name = sys.argv[1]
elif len(sys.argv) == 3:
    word_list_file_name = sys.argv[1]
    noutput = int(sys.argv[2])
elif len(sys.argv) == 4:
    word_list_file_name = sys.argv[1]
    noutput = int(sys.argv[2])
    print_extras = int(sys.argv[3])

# ----------------------------------------------------------------
if print_extras:
    print 'Computing histograms from word-list file "%s":' % \
        (word_list_file_name)
[ells_hash, M_hashes, num_words, max_word_len] = \
    populate_ell_and_M_histos(word_list_file_name)

if print_extras:
    print ''
    print 'Word-length counts:'
    total = 0
    for ell in ells_hash:
        ell_count = ells_hash[ell]
        print '%2d %7d' % (ell, ell_count)
        total += ell_count
    print 'Total: %7d' % (total)
    print ''

if print_extras:
    print 'Scaling histograms to probabilities:'
ell_and_M_histos_to_prbs(ells_hash, M_hashes, num_words, max_word_len)

if print_extras:
    print 'Word-length probabilities:'
    for ell in ells_hash:
        print '%2d %8.5f%%' % (ell, 100.0*ells_hash[ell])
    print ''

if print_extras:
    print 'Converting probabilities to CDFs:'
[ell_CDF, M_CDFs] = \
    hashes_to_CDFs(ells_hash, M_hashes, num_words, max_word_len)

if print_extras:
    print 'Generating output:'
    print ''
for k in range(0, noutput):
    word = make_random_word(ell_CDF, M_CDFs)
    print word
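
# ----------------------------------------------------------------
# Example invocation (the script name is assumed from the URL above; the
# word-list path is the built-in default and the counts are illustrative):
#   python randspell.py wordlists/input-english-gsl-2000.txt 20 1
# This reads the word list, prints the word-length and probability
# diagnostics, and then prints 20 randomly generated words.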