| 1 | #!/usr/bin/python2
 | 
  
    | 2 | # -*- coding: utf-8 -*-
 | 
  
    | 3 | #
 | 
  
    | 4 | # Projde ASF soubor a vytahne z nej vsechna slova (a prozodicka slova - zatim jen velmi zjednodusena implementace detekce!). Ze ziskanych
 | 
  
    | 5 | # slov (a p-slov) vytvori datove soubory pro MATLAB, ktere se pak pouziji pro pokusy s merenim percepcni podobnosti techto slov.
 | 
  
    | 6 | #
 | 
  
    | 7 | 
 | 
  
    | 8 | 
 | 
  
    | 9 | import copy
 | 
  
    | 10 | import re
 | 
  
    | 11 | import os.path
 | 
  
    | 12 | import sys
 | 
  
    | 13 | 
 | 
  
    | 14 | import mlf
 | 
  
    | 15 | import asf
 | 
  
    | 16 | import pm
 | 
  
    | 17 | import wavext
 | 
  
    | 18 | 
 | 
  
    | 19 | 
 | 
  
## Phone that is required to lie (roughly) in the middle of a word (see filter_word_midphon)
phone                = 'U'

## Key holding the index of the word occurrence
wordattr_index       = "index"

## Key holding the phonetic representation of the word
wordattr_phones      = "phones"
## Key holding the units of the given word; the units are dicts with attributes read from the MLF
wordattr_units       = "units"
## Key holding the sentence the word comes from
wordattr_sentence    = "sentence"

## Array of pitch-marks corresponding to the given word [pm.OnePm, ...] (single channel)
wordattr_pmarks      = "pmarks"
## Array of speech samples corresponding to the given word [int, ...]. The signal is not modified (no weighting at begin/end)
wordattr_speech      = "speech"
## Sampling frequency (int)
wordattr_sampfreq    = "sf"

## Array of speech samples BEFORE the word samples [int, ...]. The signal is not modified (no weighting at begin/end)
#  The word-plus-context signal is obtained by plain concatenation: data[wordattr_speech_lctx] + data[wordattr_speech]
wordattr_speech_lctx = "speech_lctx"
## Array of speech samples AFTER the word samples [int, ...]. The signal is not modified (no weighting at begin/end)
#  The word-plus-context signal is obtained by plain concatenation: data[wordattr_speech] + data[wordattr_speech_rctx]
wordattr_speech_rctx = "speech_rctx"

## Name of the ASF file from which the word list is created - global READ-ONLY variable (filled just once in main)
asf_fname            = None
## Path to the .pm files - global READ-ONLY variable (filled just once in main)
pmk_dpath            = None
## Path to the .wav files - global READ-ONLY variable (filled just once in main)
wav_dpath            = None

## Attribute which holds the unit name in the MLF (or ASF) file. Defaults to Mlf.attr_modelName
mlfattr_modelName    = mlf.Mlf.attr_modelName
## Attribute which holds the prosodeme type in the MLF (or ASF) file.
mlfattr_prosodmType  = "prosodemType"
 | 
  
    | 58 | 
 | 
  
    | 59 | 
 | 
  
    | 60 | 
 | 
  
#
#  -------- MAIN --------
#
def main() :
    """Read the ASF file, build the list of (prosodic) words, filter them and store the result.

    NOTE(review): work in progress -- the input/output paths are hard-coded here,
    and the bare ``return`` in the middle of the function makes everything after
    it (adding speech/pitch-mark data and storing m-files/wavs) unreachable.
    """

    # Set into the global variables ...
    global mlfattr_modelName
    global asf_fname
    global pmk_dpath
    global wav_dpath


    # Print an example of usage ...
    # parse the arguments
    # TODO: finish!!!
    asf_fname = "/home/dtihelka/Experiments/CCfeats_1class-Classifier/features/spkr_AJ/corpus.rev563.asf"
    wav_dpath = "/home/dtihelka/mnt_ArticServer/Projects/cz/anderle_jan/data/non-mastered/zkracene-pauzy/speech/"
    pmk_dpath = "/home/dtihelka/mnt_ArticServer/Projects/cz/anderle_jan/data/non-mastered/zkracene-pauzy/pitch-marks/"
    wav_fext  = ".wav"
    pmk_fext  = ".pm"
    out_dpath = "/home/dtihelka/Experiments/CCfeats_1class-Classifier/negative_examples/"


    # -------
    # Read the ASF file
    print "Nacitam data z ASF souboru: %s" % asf_fname
    asf_data  = asf.ASF(asf_fname)
    # Name of the attribute holding the unit names ...
    mlfattr_modelName = asf_data.get_mlf2asf_attribmap()[mlf.Mlf.attr_modelName]

    # -------
    # Create a hash holding all the words (prosodic ones too) and their instances from the MLF
    print "Vytvarim seznam slov ..."
    words     = get_words(asf_data)
    pwords    = get_pwords(asf_data)
    # The ASF data will not be needed any more
    asf_data  = None
    # Merge the words into a single hash
    # NOTE(review): the merge is commented out, so 'pwords' is currently computed but unused
#    words     = merge_words(words, pwords)
#    pwords    = None
    # Print some info
    print "Bylo ziskano   %d slov (vcetne pros. slov)" % len(words.keys())


    # -------
    # Filter the words
    print "Filtruji slova ..."

    #####
    #####
    #words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_max = 15)
    #print_prosodemestats(words, {"0" : 6})
    #print_prosodemestats(words, {"0" : 5, "1" : 4})
    #print_prosodemestats(words, {"0" : 5, "3" : 4})
    #print_prosodemestats(words, {"1" : 4, "3" : 4})
    #print_prosodemestats(words, {"0" : 5, "1" : 4, "3" : 4})
    #print_prosodemestats(words, __NAHRAD_SEDEM__)
    ###
    #words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_min = 5)
    #print_prosodemestats(words, {"1" : None, "3" : None})
    #print_prosodemestats(words, {"0" : None, "1" : None, "3" : None})
    #return
    #####
    #####

    words = filter_word_occurs(words, word_phnlen_min = 8, word_occurs_min = 3)
    #words = filter_word_texts( words, ("spoluprAce",   "vminulosTi", "prUmislovIx", "viSetRovAJI", "informacI",  "zAkazJIkU",
    #                                       "spoRitelni",   "potvrzuje",  "projektu",    "!opatReJI",   "zAleZitost", "!ekonomiki",
    #                                       "zamJestnancU", "kancelARe",  "primAtora",   "zAstupce",    "veRejnosTi", "hospodARstvI",
    #                                       "pozornost",    "policistU",  "sJemovni",    "ZelezJiCJI",  "republice",   "nAmJesTI,
    #                                       "nemUZeme"))
    #words = filter_word_texts( words, ("nemUZeme", "novinARUm",  "republice",  "rospoCtu",  "zdUrazJil",
    #                                       "problEmU", "nasvjeTe",   "konkurence", "Clovjeka",  "potravin"))
#    words = filter_word_texts(words, ("spoluprAce",   "prUmislovIx", "potravin",  "zAkazJIkU", "vminulosTi", "nasvjeTe",
#                                      "novinARUm",    "nemUZeme",    "pozornost", "informacI", "konkurence", "Clovjeka",
#                                      "hospodARstvI", "republice",   "rospoCtu",  "problEmU",  "zdUrazJil",  "!UsmJevem"))
    words = filter_word_midphon(words, phone)
    print "Bylo ponechano %d slov (vcetne pros. slov)" % len(words.keys())

    # Store into ASF
    store_to_asf(words, out_dpath)

#    k = words.keys()[0]
#    print 'slovo:', k
#    print 'data:',  words[k][0]

    # Filter by the required phone

    # NOTE(review): deliberate early return -- everything below is unreachable for now
    return



    # -------
    # Store the words
    print "Pridavam ke slovum recova data a pitch-marky"

    # Add the remaining attributes needed for storing to the words
    words = add_word_attribs(words, pmk_dpath, pmk_fext, wav_dpath, wav_fext, wav_cntx = 0.8)

    # And store them
    print "Ukladam slova ..."
    store_to_mfile(words, out_dpath)
#    store_to_asf  (words, out_dpath)
    store_to_wav  (words, out_dpath)

    # Done ...
    print "Hotovo."
 | 
  
    | 169 | 
 | 
  
    | 170 | 
 | 
  
    | 171 | 
 | 
  
    | 172 | 
 | 
  
    | 173 | 
 | 
  
    | 174 | 
 | 
  
    | 175 | ##
 | 
  
    | 176 | # Z ASF souboru vypreparuje seznam vsech slov a jim odpovidajicich jednotek
 | 
  
    | 177 | #
 | 
  
    | 178 | # @param  asfdata trida s daty z ASF souboru (instance asf.ASF). Musi obsahovat (krom libovolnych jinych) sloupce, ktere tvori platny
 | 
  
    | 179 | #         MLF soubor (casy, jmena jednotek, slova, pradvepodobnost neni nutna ...)!
 | 
  
    | 180 | # @return hlubokou (!) kopii vsech nalezenych slov a jejich atributu:
 | 
  
    | 181 | #         {slovo : [{wordattr_units : [{dict z MLF}, ...], wordattr_sentence : string}, ...], slovo : [...]
 | 
  
    | 182 | #
 | 
  
    | 183 | def get_words(asfdata) :
 | 
  
    | 184 | 
 | 
  
    | 185 |     # Obal ASF tridu MLF tridou (nedelej kopii, data se nemeni)
 | 
  
    | 186 |     wordlst = {}
 | 
  
    | 187 |     mlfdata = asf.ASF()  # TODO: change to mlf.Mlf
 | 
  
    | 188 |     mlfdata.from_asf(asfdata, deep_copy = False)
 | 
  
    | 189 | 
 | 
  
    | 190 |     # Mame fony jako jednotky?
 | 
  
    | 191 |     segtype = mlfdata.get_comment_attrib("unitType")
 | 
  
    | 192 |     if segtype != "phone" :
 | 
  
    | 193 |        print "\nPOZOR: Neznamy typ jednotek '%s' (ocekavany jsou fony)\n\n" % str(segtype)
 | 
  
    | 194 | 
 | 
  
    | 195 |     # Vypreparuj vsechna slova z mlf a uloz po dole hashu ...
 | 
  
    | 196 |     for sentence, words in mlfdata.word_insts(mlfdata.word_occs().keys()).items() :
 | 
  
    | 197 |         for w in words :
 | 
  
    | 198 |             # Text slova
 | 
  
    | 199 |             wordtext = w['trans']
 | 
  
    | 200 |             wordsegs = w['segments']
 | 
  
    | 201 | 
 | 
  
    | 202 |             # Je slovo uz v hashi?
 | 
  
    | 203 |             if not wordlst.has_key(wordtext) :
 | 
  
    | 204 |                wordlst[wordtext] = []
 | 
  
    | 205 |             # Pridej slovo do seznamu. Jmena atributu urci mlf.Mlf, ale nejsou tam definovane jako konstanty
 | 
  
    | 206 |             wordlst[wordtext].append({wordattr_units : wordsegs, wordattr_sentence : sentence})
 | 
  
    | 207 | 
 | 
  
    | 208 |             # Otestuj, jestli se text vytvoreny ze jmen jednotek shoduje s textem v atributu 'text'
 | 
  
    | 209 |             # Pouze pro jistotu (vim, jsem paranoidni ...)
 | 
  
    | 210 |             segtext  = ''.join([u[mlfdata.attr_modelName] for u in wordsegs])
 | 
  
    | 211 |             if wordtext != segtext :
 | 
  
    | 212 |                raise Exception, "Neshoda textu slova z MLF '%s' a textu se segmentu jednotek '%s'" % (wordtext, segtext)
 | 
  
    | 213 | 
 | 
  
    | 214 |     # Vrat kopii
 | 
  
    | 215 |     return copy.deepcopy(wordlst)
 | 
  
    | 216 | 
 | 
  
    | 217 | 
 | 
  
    | 218 | ##
 | 
  
    | 219 | # Z ASF souboru vypreparuje seznam vsech prozodickych slov a jim odpovidajicich jednotek
 | 
  
    | 220 | #
 | 
  
    | 221 | # @param  asfdata trida s daty z ASF souboru (instance asf.ASF). Musi obsahovat (krom libovolnych jinych) sloupce, ktere tvori platny
 | 
  
    | 222 | #         MLF soubor (casy, jmena jednotek, slova, pradvepodobnost neni nutna ...)!
 | 
  
    | 223 | # @return hlubokou (!) kopii vsech nalezenych prozodickych slov a jejich atributu:
 | 
  
    | 224 | #         {slovo : [{wordattr_units : [{dict z MLF}, ...], wordattr_sentence : string}, ...], slovo : [...]
 | 
  
    | 225 | #
 | 
  
    | 226 | def get_pwords(asfdata) :
 | 
  
    | 227 | 
 | 
  
    | 228 |     # Zkopiruj data ...
 | 
  
    | 229 | #    data = copy.deepcopy(asfdata)
 | 
  
    | 230 |     mlfdata = asf.ASF()  # TODO: change to mlf.Mlf
 | 
  
    | 231 |     mlfdata.from_asf(asfdata, deep_copy = True)
 | 
  
    | 232 |     # The sequence of words in the prosodic word
 | 
  
    | 233 |     words   = None
 | 
  
    | 234 |     wordbeg = None
 | 
  
    | 235 | 
 | 
  
    | 236 |     # Vytvori prozodicka slova, pokud uz v ASF nejsou. Vyuzije se priznak "pwordBoundPos"
 | 
  
    | 237 |     for sentence, units in mlfdata.getutts().items() :
 | 
  
    | 238 |         for i,unit in enumerate(units) :
 | 
  
    | 239 |             # If the unit is pause, just ignore it
 | 
  
    | 240 |             if   unit['pwordBoundPos'] in 'PS' :
 | 
  
    | 241 |                  continue
 | 
  
    | 242 |             # If the unit is first in the prosodic word, remember it and add its word ()
 | 
  
    | 243 |             elif unit['pwordBoundPos'] == 'F' :
 | 
  
    | 244 |                  words   = [unit['word'], ]
 | 
  
    | 245 |                  wordbeg = i
 | 
  
    | 246 |             # If the unit is last in the prosodic word, fill the prosodic word
 | 
  
    | 247 |             elif unit['pwordBoundPos'] == 'L' :
 | 
  
    | 248 |                  # Create the string
 | 
  
    | 249 |                  pword = '|'.join([w for w in words if w != None])
 | 
  
    | 250 |                  words = None
 | 
  
    | 251 |                  # Store to ASF
 | 
  
    | 252 |                  mlfdata.set_attrib(sentence, wordbeg, 'pword', pword)
 | 
  
    | 253 |             else :
 | 
  
    | 254 |                  words.append(unit.get('word', None))
 | 
  
    | 255 | 
 | 
  
    | 256 |     # Zmen sloupec se slovy na prozodicka slova ... Trosku hack, ale pak muzeme vyuzit uz existujici kod pro "normalni" slova
 | 
  
    | 257 |     mlfdata.add_asf2mlf_attribmap("pword", mlf.Mlf.attr_word)
 | 
  
    | 258 | 
 | 
  
    | 259 |     # Vrat seznam prozodickych slov ..
 | 
  
    | 260 |     return get_words(mlfdata)
 | 
  
    | 261 | 
 | 
  
    | 262 | 
 | 
  
    | 263 | ##
 | 
  
    | 264 | # Udela sjednoceni slov - pokud je stejne slovo v obou hashich, pak udelej sjednoceni atributu, jinak kopiruj cela slova
 | 
  
    | 265 | #
 | 
  
    | 266 | # @param
 | 
  
    | 267 | # @param
 | 
  
    | 268 | # @return novou hash s hlubokymi (!) kopiemi sjednoceni slov
 | 
  
    | 269 | #
 | 
  
    | 270 | def merge_words(words1, words2) :
 | 
  
    | 271 | 
 | 
  
    | 272 |     ## Testuje, jestli je instance slova 'word' obsazena v poli 'words'. Instance slov jsou zadana jako hash parametru
 | 
  
    | 273 |     def contains_word(words, word) :
 | 
  
    | 274 |         return True in [          w[wordattr_sentence]                                 ==           word[wordattr_sentence]                                 and \
 | 
  
    | 275 |                               len(w[wordattr_units])                                   ==       len(word[wordattr_units])                                   and \
 | 
  
    | 276 |                         int(float(w[wordattr_units][0 ][mlf.Mlf.attr_begTime])*100000) == int(float(word[wordattr_units][0 ][mlf.Mlf.attr_begTime])*100000) and \
 | 
  
    | 277 |                         int(float(w[wordattr_units][-1][mlf.Mlf.attr_endTime])*100000) == int(float(word[wordattr_units][-1][mlf.Mlf.attr_endTime])*100000) for w in words]
 | 
  
    | 278 | 
 | 
  
    | 279 |     # Zkopiruj prvni hash
 | 
  
    | 280 |     merged = words1
 | 
  
    | 281 |     # Prochazej druhou hash a pridavej do ni slova
 | 
  
    | 282 |     for word_text, word_cands in words2.iteritems() :
 | 
  
    | 283 |         # Pokud slovo neni v hashi, zkopiruj jej cele, jinak pridej jen kandidaty, kteri jeste nejsou
 | 
  
    | 284 |         if not merged.has_key(word_text) : merged[word_text] =                  word_cands
 | 
  
    | 285 |         else                             : merged[word_text].extend([w for w in word_cands if not contains_word(merged[word_text], w)])
 | 
  
    | 286 | 
 | 
  
    | 287 |     # Vrat vyslednou hash
 | 
  
    | 288 |     return copy.deepcopy(merged)
 | 
  
    | 289 | 
 | 
  
    | 290 | 
 | 
  
    | 291 | 
 | 
  
    | 292 | ##
 | 
  
    | 293 | # Zachova jen slova s pozadovanym minimalnim poctem vyskytu a minimalni delkou
 | 
  
    | 294 | #
 | 
  
    | 295 | # @param  words       hash se slovy, primo tato hash se upravuje!
 | 
  
    | 296 | # @param  word_phnlen_min minimalni delka slova ve fonech (int). Pokud neni definovan, neuvazuje se
 | 
  
    | 297 | # @param  word_phnlen_max maximalni delka slova ve fonech (int). Pokud neni definovan, neuvazuje se
 | 
  
    | 298 | # @param  word_occurs_min minimalni pocet vyskytu slova (int). Pokud neni definovan, neuvazuje se
 | 
  
    | 299 | # @param  word_occurs_max maximalni pocet vyskytu slova (int). Pokud neni definovan, neuvazuje se
 | 
  
    | 300 | # @return upravenou hash 'words' (primo, nikoli kopie)
 | 
  
    | 301 | #
 | 
  
    | 302 | def filter_word_occurs(words, word_phnlen_min = 0, word_phnlen_max = sys.maxint, word_occurs_min = 0, word_occurs_max = sys.maxint) :
 | 
  
    | 303 | 
 | 
  
    | 304 |     # Projdi slova
 | 
  
    | 305 |     for word_text, word_cands in words.items() :
 | 
  
    | 306 |         if len(word_text) < word_phnlen_min or len(word_text) > word_phnlen_max or len(word_cands) < word_occurs_min or len(word_cands) > word_occurs_max :
 | 
  
    | 307 |            del words[word_text]
 | 
  
    | 308 | 
 | 
  
    | 309 |     # Vrati vysledek
 | 
  
    | 310 |     return words
 | 
  
    | 311 | 
 | 
  
    | 312 | ##
 | 
  
    | 313 | # Zachova jen slova, ktera obsahuji uprostred pozadovany fon s pozadovanym minimalnim poctem vyskytu a minimalni delkou
 | 
  
    | 314 | #
 | 
  
    | 315 | # @param  words hash se slovy, primo tato hash se upravuje!
 | 
  
    | 316 | # @param  phone pozadovany fon, ktery se hleda zhruba uprosted slova
 | 
  
    | 317 | # @return upravenou hash 'words' (primo, nikoli kopie)
 | 
  
    | 318 | #
 | 
  
    | 319 | def filter_word_midphon(words, phone) :
 | 
  
    | 320 | 
 | 
  
    | 321 |     # Projdi slova
 | 
  
    | 322 |     for word_text in words.keys() :
 | 
  
    | 323 |         # Interval okolo prestredniho fonu
 | 
  
    | 324 |         mid = len(word_text)/2.0
 | 
  
    | 325 |         beg = int(mid - 1.0)
 | 
  
    | 326 |         end = int(mid + 1.0)
 | 
  
    | 327 |         # Je tam pozadovany fon?
 | 
  
    | 328 |         if not phone in word_text[beg:end+1] :
 | 
  
    | 329 |            del words[word_text]
 | 
  
    | 330 |            continue
 | 
  
    | 331 | 
 | 
  
    | 332 |         # Mark the phone otherwise
 | 
  
    | 333 |         indx = word_text.find(phone, beg, end+1)
 | 
  
    | 334 |         # Vsechna slova
 | 
  
    | 335 |         for word in words[word_text] :
 | 
  
    | 336 |             segs =  word['units']
 | 
  
    | 337 |             # Add feature
 | 
  
    | 338 |             segs[indx]['concatenate'] = '*><*'
 | 
  
    | 339 | 
 | 
  
    | 340 |     # Vrati vysledek
 | 
  
    | 341 |     return words
 | 
  
    | 342 | 
 | 
  
    | 343 | ##
 | 
  
    | 344 | # Zachova jen pozadovana slova
 | 
  
    | 345 | #
 | 
  
    | 346 | # @param  words hash se slovy, primo tato hash se upravuje!
 | 
  
    | 347 | # @param  texts pole slov ktera se maji zachovat [string, ...]
 | 
  
    | 348 | # @return upravenou hash 'words' (primo, nikoli kopie)
 | 
  
    | 349 | #
 | 
  
    | 350 | def filter_word_texts(words, texts) :
 | 
  
    | 351 | 
 | 
  
    | 352 |     # Projdi slova
 | 
  
    | 353 |     for word_text in words.keys() :
 | 
  
    | 354 |         if not True in [word_text == t for t in texts] :
 | 
  
    | 355 |            del words[word_text]
 | 
  
    | 356 | 
 | 
  
    | 357 |     # Vrati vysledek
 | 
  
    | 358 |     return words
 | 
  
    | 359 | 
 | 
  
##
# Adds the 'wordattr_pmarks', 'wordattr_speech' and 'wordattr_sampfreq' attributes to every word occurrence;
# the other attributes stay unchanged!
#
# @param  words     hash with the words; this very hash is modified!
# @param  pmk_dpath directory with the pitch-mark files (one per sentence, named <sentence><pmk_fext>)
# @param  pmk_fext  filename extension of the pitch-mark files (e.g. ".pm")
# @param  wav_dpath directory with the speech files (one per sentence, named <sentence><wav_fext>)
# @param  wav_fext  filename extension of the speech files (e.g. ".wav")
# @param  wav_cntx  number of seconds of the speech signal stored before and behind the word
# @return the modified hash 'words' (the hash itself, not a copy)
#
def add_word_attribs(words, pmk_dpath, pmk_fext, wav_dpath, wav_fext, wav_cntx = None) :

    # Walk through the words and all their candidates
    for word_text,  word_cands in words.items() :
        print "     ---> pridavam atributy pro slovo %s (%d kandidatu)" % (word_text, len(word_cands))

        for indx,cand in enumerate(word_cands) :

            # Read the pitch-marks and the speech for the sentence the candidate comes from
            pmk_file = pm.Pm(         os.path.join(pmk_dpath, cand[wordattr_sentence] + pmk_fext), pm.Pm.shift_nearest)
            wav_file = wavext.WavRead(os.path.join(wav_dpath, cand[wordattr_sentence] + wav_fext))

            # Align the begin and end times of all the units in the word to pitch-marks. When they are aligned
            # already, nothing actually changes
            for u in cand[wordattr_units] :
                u[mlf.Mlf.attr_begTime] = pmk_file.find_pmk(float(u[mlf.Mlf.attr_begTime]), skip_T = True).get_time()
                u[mlf.Mlf.attr_endTime] = pmk_file.find_pmk(float(u[mlf.Mlf.attr_endTime]), skip_T = True).get_time()

            # Begin and end time of the whole word
            beg_time = float(cand[wordattr_units][ 0][mlf.Mlf.attr_begTime])
            end_time = float(cand[wordattr_units][-1][mlf.Mlf.attr_endTime])

            # Sampling frequency and total length of the wav (in seconds)
            sampfreq = wav_file.getframerate()
            wavlengt = wav_file.getnframes() / float(sampfreq)

            # Fill the arrays of speech data and pitch-marks for the whole word.
            # Shift the pitch-mark array so that it starts at the word begin, skipping the transitional (T)
            # pitch-marks. Add 1 - MATLAB indexes from 1
            wav_data = tuple( wav_file[int(beg_time * sampfreq) : int(end_time * sampfreq) +1])
            pmk_data = tuple([pm.OnePm((p.get_time() - beg_time) * sampfreq +1, p.get_type()) for p in pmk_file.get_pmks(beg_time, end_time) if p.get_type() != p.type_transitional])

            # Store into the word attributes
            cand[wordattr_pmarks]   = pmk_data
            cand[wordattr_speech]   = wav_data
            cand[wordattr_sampfreq] = sampfreq
            cand[wordattr_index]    = indx +1   # Index from 1

            # Also add the context, when it is required
            if wav_cntx and wav_cntx > 0.0 :
               # Shift the times
               ctx_beg = beg_time - wav_cntx
               ctx_end = end_time + wav_cntx
               # Clamp them to the range of the wav file
               if ctx_beg < 0.0      : ctx_beg = 0.0
               if ctx_end > wavlengt : ctx_end = wavlengt
               # And fill the data (plain concatenation with wordattr_speech gives the word plus context)
               cand[wordattr_speech_lctx] = tuple(wav_file[int(ctx_beg  * sampfreq) : int(beg_time * sampfreq)])
               cand[wordattr_speech_rctx] = tuple(wav_file[int(end_time * sampfreq) : int(ctx_end  * sampfreq)])


    # Return the result
    return words
 | 
  
    | 423 | 
 | 
  
    | 424 | 
 | 
  
##
# Writes the hash of words into MATLAB m-files (one file per word, holding all the candidates of that word)
#
# @param  words     hash with the words; the candidates must already carry the attributes added
#                   by add_word_attribs (speech, pitch-marks, sampling frequency, index)
# @param  out_dpath directory the <word>_data.m files are written into
#
def store_to_mfile(words, out_dpath) :

    # All the words
    for word_text, word_cands in words.items() :

        # Name of the MATLAB function
        funct = word_text + "_data"

        # Open a new m-file, named after the text of the word
        mfile = open(os.path.join(out_dpath, funct + ".m"), "wt")
        # Write its header (the comments inside the m-file are intentionally kept in Czech)
        mfile.write("function data = %s()\n" % funct)
        mfile.write("%\n")
        mfile.write("%% Funkce vrati data vsech vyskytu slova:          '%s'\n" % word_text)
        mfile.write("%% Data vyskytu slov jsou vygenerovana ze souboru: '%s'\n" % asf_fname)
        mfile.write("%% Data pitch-marku pochazi z adresare:            '%s'\n" % pmk_dpath)
        mfile.write("%% Recova data pochazi z adresare:                 '%s'\n" % wav_dpath)
        mfile.write("%% Soubor byl vytvoren automaticky skriptem:       '%s'\n" % sys.argv[0])
        mfile.write("%\n")
        mfile.write("% Vracena data jsou cell pole {i,j}, kde i je index slova a j je index dat v poradi:\n\n")
        mfile.write("%    1 - veta, ze ktere slovo pochazi\n")
        mfile.write("%    2 - casy zacatku slova v souboru [sec]\n")
        mfile.write("%    3 - cas konce slova v souboru [sec]\n")
        mfile.write("%    4 - pole recovych vzorku (short) odpovidajici slovu (zarovnane na p-mark, nijak nevazene)\n")
        mfile.write("%    5 - vzorkovaci frekvence (int cislo)\n")
        mfile.write("%    6 - pole pitch-marku jako indexy do recovych vzorku [1, ..., length(vzorky)]\n")
        mfile.write("%    7 - pole typu pitch-marku (retezec znaku)\n")
        mfile.write("%    8 - casy zacatku fonemu daneho slova jako indexy do recovych vzorku [1, ...]\n")
        mfile.write("%    9 - casy koncu fonemu daneho slova jako indexy do recovych vzorku [..., length(vzorky)]\n")
        mfile.write("%\n")
        mfile.write("\n\n")

        # The executable code - open the cell array which is going to be returned
        mfile.write("data = [\n")

        # ------
        # Process candidate by candidate and store the data into the m-file
        for cand in word_cands :

            # Begin and end time of the word
            beg_time = float(cand[wordattr_units][ 0][mlf.Mlf.attr_begTime])
            end_time = float(cand[wordattr_units][-1][mlf.Mlf.attr_endTime])
            sampfreq =       cand[wordattr_sampfreq]

            # Fill the arrays of phone begins and ends within the word, converted directly to speech-sample
            # indexes. Add 1 - MATLAB indexes from 1
            beg_phns = [int((float(u[mlf.Mlf.attr_begTime]) - beg_time) * sampfreq) +1 for u in cand[wordattr_units]]
            end_phns = [int((float(u[mlf.Mlf.attr_endTime]) - beg_time) * sampfreq) +1 for u in cand[wordattr_units]]
            # Name of the record
            name     = "%02d_%s" % (cand[wordattr_index], cand[wordattr_sentence])

            # Write everything into the m-file
            mfile.write("        {")
            mfile.write("  '%s'  %9.5f  %9.5f" % (name, beg_time, end_time))
            mfile.write("  " + str(list(cand[wordattr_speech])))                                          # Speech samples copied into a list: exploits here (and below) that Python prints str([]) as "[v1, v2, ...]"
            mfile.write("  " +      str(sampfreq))                                                        # Sampling frequency
            mfile.write("  " + str([int(p.get_time()) for p in cand[wordattr_pmarks]]))                   # Times as an array of integers. NOTE(review): the original comment mentions adding 0.5 to suppress arithmetic errors, but the code only truncates -- confirm which is intended
            mfile.write("  " + str([    p.get_type()  for p in cand[wordattr_pmarks]]).replace('"', "'")) # Types as an array of characters; double quotes must be replaced by apostrophes for MATLAB
            mfile.write("  " + str(beg_phns))
            mfile.write("  " + str(end_phns))
            mfile.write("};\n")

        # Close the data array in the m-file and close the file
        mfile.write("       ];\n\n")
        mfile.close()
 | 
  
    | 496 | 
 | 
  
    | 497 | 
 | 
  
    | 498 | ##
 | 
  
    | 499 | # Hash slov zapise do ASF souboru (jeden soubor na slovo, obsahuje vsechny kandidaty daneho slova)
 | 
  
    | 500 | #
 | 
  
    | 501 | # @param
 | 
  
    | 502 | # @param
 | 
  
    | 503 | #
 | 
  
    | 504 | def store_to_asf(words, out_dpath) :
 | 
  
    | 505 | 
 | 
  
    | 506 |     ## Nova ASF trida
 | 
  
    | 507 |     #asfdata = asf.ASF()
 | 
  
    | 508 |     #asfdata.set_coment(('',
 | 
  
    | 509 |                         #'Seznam slov v souboru:',
 | 
  
    | 510 |                         #'\n'.join('    ' + w for w in words.iterkeys()),
 | 
  
    | 511 |                       #))
 | 
  
    | 512 | 
 | 
  
    | 513 |     # Vsechna slova
 | 
  
    | 514 |     for word_text, word_cands in words.items() :
 | 
  
    | 515 |         # Nova ASF trida
 | 
  
    | 516 |         asfdata = asf.ASF()
 | 
  
    | 517 |         # Vsechny kandidaty
 | 
  
    | 518 |         for cand in word_cands :
 | 
  
    | 519 |             sentence = cand[wordattr_sentence]
 | 
  
    | 520 |             # Prida vetu, neni-li jeste ulozena
 | 
  
    | 521 |             if not asfdata.getutts().has_key(sentence) :
 | 
  
    | 522 |                asfdata.append_utt(sentence)
 | 
  
    | 523 |             # A pridej jednotlive priznaky jednotek do teto vety
 | 
  
    | 524 |             for u in cand[wordattr_units] :
 | 
  
    | 525 |                 asfdata.add_attribs(u, sentence)
 | 
  
    | 526 | 
 | 
  
    | 527 |         # Uloz ASF do souboru
 | 
  
    | 528 |         asfdata.write_asf(os.path.join(out_dpath, 'words_{}.asf'.format(word_text)))
 | 
  
    | 529 | 
 | 
  
    | 530 | 
 | 
  
    | 531 | ##
 | 
  
    | 532 | # Rozsireny signal slov z hashe slov (wordattr_speech_ctx) zapise do wav souboru (jeden soubor na slovo a kandidata). Take
 | 
  
    | 533 | # ulozi .lab soubory citelne ve wavesurferu s informaci o hranicich jednotek.
 | 
  
    | 534 | #
 | 
  
    | 535 | # @param
 | 
  
    | 536 | # @param
 | 
  
    | 537 | #
 | 
  
    | 538 | def store_to_wav(words, out_dpath) :
 | 
  
    | 539 | 
 | 
  
    | 540 |     # Vsechna slova a vsechny kandidaty
 | 
  
    | 541 |     for word_text, word_cands in words.items() :
 | 
  
    | 542 | #        for cand_data, cand_indx in [(word_cands[i], i) for i in range(len(word_cands))] :
 | 
  
    | 543 |         for cand_data in word_cands :
 | 
  
    | 544 |             # Jmeno souboru
 | 
  
    | 545 | #            fname  = word_text + "_%02d_%s_to_listen" % (cand_indx +1, cand_data[wordattr_sentence])
 | 
  
    | 546 |             fname  = word_text + "_%02d_%s_to_listen" % (cand_data[wordattr_index], cand_data[wordattr_sentence])
 | 
  
    | 547 | 
 | 
  
    | 548 |             # Vytvor wav (s kontextem!). Proste spojeni MUSI dat cele slovo, jak je psano v dokumentaci ;-)
 | 
  
    | 549 |             # napln cas, kdy zacita skutecne slovo
 | 
  
    | 550 |             sampdata = cand_data[wordattr_speech_lctx] + cand_data[wordattr_speech] + cand_data[wordattr_speech_rctx]
 | 
  
    | 551 |             sampfreq = cand_data[wordattr_sampfreq]
 | 
  
    | 552 |             # Uloz wav
 | 
  
    | 553 |             wavext.WriteWav(os.path.join(out_dpath, fname + ".wav"), sampdata, sampfreq, 1)
 | 
  
    | 554 | 
 | 
  
    | 555 |             # Vytvor data pro .lab soubor. Obsahuje casy hranic fonu a jejich labely
 | 
  
    | 556 |             lctxlen  =   len(cand_data[wordattr_speech_lctx]) / float(cand_data[wordattr_sampfreq])
 | 
  
    | 557 |             begtime  = float(cand_data[wordattr_units][0][mlf.Mlf.attr_begTime])
 | 
  
    | 558 |             # Casy labelu posun tak, aby ukazovaly presne na zacatek slova (tedy ZA kontext)
 | 
  
    | 559 |             labdata = ["%f %f %s" % (float(u[mlf.Mlf.attr_begTime]) -begtime +lctxlen, float(u[mlf.Mlf.attr_endTime]) -begtime +lctxlen, u[mlfattr_modelName]) for u in cand_data[wordattr_units]]
 | 
  
    | 560 | 
 | 
  
    | 561 |             # Uloz .lab soubor
 | 
  
    | 562 |             labfile = open(os.path.join(out_dpath, fname + ".lab"), 'wt')
 | 
  
    | 563 |             labfile.write("\n".join(labdata))
 | 
  
    | 564 |             labfile.close()
 | 
  
    | 565 | 
 | 
  
    | 566 | 
 | 
  
    | 567 | ##
 | 
  
    | 568 | # Pro dane slovo vypise v jakych se nachazi pozicich
 | 
  
    | 569 | #
 | 
  
    | 570 | # @param words_dict hash se slovy vygenerovana timto projektem
 | 
  
    | 571 | # @param word_text text slova
 | 
  
    | 572 | #
 | 
  
    | 573 | def print_wordstats(words, word_text) :
 | 
  
    | 574 | 
 | 
  
    | 575 |     print "Informace pro slovo:    %s" % word_text
 | 
  
    | 576 |     print "    pocet realizaci:    %d" % len(words[word_text])
 | 
  
    | 577 |     print "--"
 | 
  
    | 578 | #    for p in words[word_text] :
 | 
  
    | 579 | #        print "    typ prozodemu:      %s" % p[attr_prosodemeType]
 | 
  
    | 580 | #        print "    delka prozodemu:    %d" % p[attr_prosodemeLen]
 | 
  
    | 581 | #        print "    pozice v prozodemu: %d" % p[attr_prosodemePos]
 | 
  
    | 582 | #        print "    veta:               %s" % p[attr_sentence]
 | 
  
    | 583 | #        print "--"
 | 
  
    | 584 | 
 | 
  
    | 585 | 
 | 
  
    | 586 | ##
 | 
  
    | 587 | # Vypise vsechna slova ktera se vyskytuji v danych prozodemech
 | 
  
    | 588 | #
 | 
  
    | 589 | # @param words_dict hash se slovy vygenerovana timto projektem
 | 
  
    | 590 | # @param prosodemes hash {typ_prozodemu : min_vyskytu, ...}. Pokud je min_vyskytu None, berou se v potaz vsechny vyskyty
 | 
  
    | 591 | #
 | 
  
    | 592 | def print_prosodemestats(words, prosodemes) :
 | 
  
    | 593 | 
 | 
  
    | 594 |     # Prochazej slova
 | 
  
    | 595 |     for word_text, word_insts in words.items() :
 | 
  
    | 596 |         # Hash pozadovanych typu
 | 
  
    | 597 |         types = dict.fromkeys(prosodemes.keys(), [])
 | 
  
    | 598 |         # Projede vsechny instance slova a ukladej slova, ktera jsou v pozadovanem typu
 | 
  
    | 599 |         for word_data in word_insts :
 | 
  
    | 600 |             curr_type = dict.fromkeys([w[mlfattr_prosodmType] for w in word_data[wordattr_units] if w.has_key(mlfattr_prosodmType)], 0)
 | 
  
    | 601 |             # Ma slovo atribut typ prozodemu? Pokud ano, musi mit hash s aktualnim typem prozodemu prave jeden zaznam!
 | 
  
    | 602 |             if len(curr_type.keys()) != 1  :
 | 
  
    | 603 |                raise Exception, "Chybi atribut typu pros. slova '%s' ve slove '%s', nebo slovo patri do vice prozodemu %s" % (mlfattr_prosodmType, word_text, str(curr_type.keys()))
 | 
  
    | 604 |             # Nastav aktualni (ten jeden) typ prozodemu
 | 
  
    | 605 |             curr_type = curr_type.keys()[0]
 | 
  
    | 606 | 
 | 
  
    | 607 |             # Pokud je typ ten, ktery hledame, pridej slovo do pole. Je treba pridavat takto, jinak se prida ke vsem typum,
 | 
  
    | 608 |             # viz dict.fromkeys()
 | 
  
    | 609 |             if prosodemes.has_key(curr_type) :
 | 
  
    | 610 |                types[curr_type] = types[curr_type] + [word_data, ]
 | 
  
    | 611 | 
 | 
  
    | 612 |         #print [(key, prosodemes[key] <= len(types[key]), len(types[key])) for key in prosodemes.keys()]
 | 
  
    | 613 | 
 | 
  
    | 614 |         # Pokud existuji hodnoty, ktere nedosahly pozadovaneho poctu kandidatu, ignoruj
 | 
  
    | 615 |         if False in [prosodemes[key] <= len(types[key]) for key in prosodemes.keys()] :
 | 
  
    | 616 |            continue
 | 
  
    | 617 | 
 | 
  
    | 618 |         # Jinak vypis
 | 
  
    | 619 |         print "Informace pro slovo:    %s" % word_text
 | 
  
    | 620 |         print "    pocet realizaci:    %d" % len(word_insts)
 | 
  
    | 621 |         print "--"
 | 
  
    | 622 |         for type_key, insts in types.items() :
 | 
  
    | 623 |             print "    typ prozodemu:      %s" %     type_key
 | 
  
    | 624 |             print "    pocet realizaci:    %d" % len(insts)
 | 
  
    | 625 | #            print "    vety:               %s" % str([w[wordattr_sentence] for w in insts])
 | 
  
    | 626 |             print "--"
 | 
  
    | 627 | 
 | 
  
    | 628 | 
 | 
  
    | 629 | 
 | 
  
#
# --------- MAIN -------------
#
# Entry point guard: run only when the file is executed as a script, not on
# import. NOTE(review): main() is presumably defined earlier in this file —
# it is not visible in this chunk; confirm it exists.
if __name__ == "__main__" :
   main()
 |