| 1 | #!/usr/bin/python2
 | 
  
    | 2 | 
 | 
  
    | 3 | import codecs
 | 
  
    | 4 | import glob
 | 
  
    | 5 | import json
 | 
  
    | 6 | 
 | 
  
    | 7 | import asf_new as asf  # dummy asf_new.py. To be replace just by asf
 | 
  
    | 8 | 
 | 
  
    | 9 | fmask = 'words_*.asf'
 | 
  
    | 10 | omask = 'words_{}_{}-{}'  # Mask for the word
 | 
  
    | 11 | 
 | 
  
    | 12 | # ASF items confersion
 | 
  
    | 13 | iconv = {'phone'         : unicode,
 | 
  
    | 14 |          'word'          : unicode,
 | 
  
    | 15 |          'prosodeme'     : unicode,
 | 
  
    | 16 |          'concatenate'   : unicode,
 | 
  
    | 17 |          'suspicious'    : unicode,
 | 
  
    | 18 |          'pphrsBoundPos' : unicode,
 | 
  
    | 19 |          'pwordBoundPos' : unicode,
 | 
  
    | 20 |          'mlfEndTime'    : float,
 | 
  
    | 21 |          'mlfBegTime'    : float,
 | 
  
    | 22 |          'HTKScore'      : unicode,  # Not used as number here
 | 
  
    | 23 |         }
 | 
  
    | 24 | 
 | 
  
    | 25 | 
 | 
  
    | 26 | def main() :
 | 
  
    | 27 | 
 | 
  
    | 28 |     # Nacte ASF soubory
 | 
  
    | 29 |     # - co soubor, to stejne slovo
 | 
  
    | 30 |     for fname in glob.glob(fmask) :
 | 
  
    | 31 | 
 | 
  
    | 32 |         # Read the ASF data
 | 
  
    | 33 |         asfdata = asf.ASF(attrconv = iconv)
 | 
  
    | 34 |         asfdata.read_file(fname)
 | 
  
    | 35 |         # Individual middle-phone splits for the given word
 | 
  
    | 36 |         splits  = {}
 | 
  
    | 37 | 
 | 
  
    | 38 |         # Jmeno je foneticky, ale ma u sebe prefix a suffix, ktery se musi opravit
 | 
  
    | 39 |         word    = fname
 | 
  
    | 40 |         word    = word.replace('words_', '') # Odstran zacatek
 | 
  
    | 41 |         word    = word.replace('.asf',  '') # Odstran konec
 | 
  
    | 42 |         phntext = '|{}|'.format(word)
 | 
  
    | 43 | 
 | 
  
    | 44 |         # Get individual concatenation parts
 | 
  
    | 45 |         for sent,segs in asfdata.iteritems() :
 | 
  
    | 46 |             # Find the split unit
 | 
  
    | 47 |             split,_ = segs.find_exact('concatenate', '*><*')
 | 
  
    | 48 |             part1   = segs[0:split+1]
 | 
  
    | 49 |             part2   = segs[split:]
 | 
  
    | 50 |             # Convert them to diphones
 | 
  
    | 51 |             part1   = to_diphs(part1, sent)
 | 
  
    | 52 |             part2   = to_diphs(part2, sent)
 | 
  
    | 53 | 
 | 
  
    | 54 |             # Store the parts
 | 
  
    | 55 |             splits[sent] = (part1, part2)
 | 
  
    | 56 | 
 | 
  
    | 57 |         # Zkombinuj ruzne casti slova do JSON struktury
 | 
  
    | 58 |         for sent1,(part1,part2) in splits.iteritems() :
 | 
  
    | 59 |             for sent2,(partA,partB) in splits.iteritems() :
 | 
  
    | 60 |                 # Ignore the matching parts
 | 
  
    | 61 |                 if sent1 == sent2 :
 | 
  
    | 62 |                    continue
 | 
  
    | 63 | 
 | 
  
    | 64 |                 # Output file
 | 
  
    | 65 |                 ofile = omask.format(word, sent1, sent2)
 | 
  
    | 66 | 
 | 
  
    | 67 |                 # Join the parts
 | 
  
    | 68 |                 join  = part1 + partB
 | 
  
    | 69 |                 # Add first and final pause
 | 
  
    | 70 |                 join.insert(0, {'diphone' : '${}'.format(join[ 0]['l-phone'])})
 | 
  
    | 71 |                 join.append(   {'diphone' : '{}$'.format(join[-1]['r-phone'])})
 | 
  
    | 72 | 
 | 
  
    | 73 |                 # Build JSON from it
 | 
  
    | 74 |                 units = []
 | 
  
    | 75 |                 data  = {'phntext' :  phntext,
 | 
  
    | 76 |                          'phrtype' :  to_phrtype(join),
 | 
  
    | 77 |                          'phrlctx' : '$',
 | 
  
    | 78 |                          'phrrctx' : '$',
 | 
  
    | 79 |                          'units'   :  units,
 | 
  
    | 80 |                          'File'    :  ofile + '.wav',
 | 
  
    | 81 |                         }
 | 
  
    | 82 |                 # Add units
 | 
  
    | 83 |                 for u in join :
 | 
  
    | 84 |                     u =  dict(u)  # Make copy. Will be changed
 | 
  
    | 85 |                     d =  u['diphone']
 | 
  
    | 86 |                     # Remove all not-required keys
 | 
  
    | 87 |                     for k in set(u.keys()).difference(('BegTime', 'EndTime', 'Sentence')) :
 | 
  
    | 88 |                         del u[k]
 | 
  
    | 89 |                     # Add to JSON
 | 
  
    | 90 |                     item = {'name'  :  d, }
 | 
  
    | 91 |                     # Add the info for cndidate to be kep, if there is such
 | 
  
    | 92 |                     if u :
 | 
  
    | 93 |                        item['keep'] = (u, )
 | 
  
    | 94 |                     # Add it to the data
 | 
  
    | 95 |                     units.append(item)
 | 
  
    | 96 | 
 | 
  
    | 97 |                 # Store the JSON to disc. Wrap data to a sequence as JSON must be a sequence
 | 
  
    | 98 |                 json.dump((data, ), codecs.open(ofile + '.json', 'wt', 'utf8'), sort_keys = True, indent = 4)
 | 
  
    | 99 | #                # Add it to the main json
 | 
  
    | 100 | #                sdata.append(data)
 | 
  
    | 101 | 
 | 
  
    | 102 | #    # Store the JSON to disc
 | 
  
    | 103 | #    json.dump(sdata, codecs.open(ojson, 'wt', 'utf8'), sort_keys = True, indent = 4)
 | 
  
    | 104 | 
 | 
  
    | 105 | 
 | 
  
## Converts a sequence of phones into a sequence of diphones
 | 
  
    | 107 | #
 | 
  
    | 108 | def to_diphs(units, sent) :
 | 
  
    | 109 |     # Copy
 | 
  
    | 110 |     uout = []
 | 
  
    | 111 |     # Build diphones, copying the base features required for JSon
 | 
  
    | 112 |     for u1,u2 in zip(units[:-1], units[1:]) :
 | 
  
    | 113 |         uout.append({'Sentence'  :  sent,
 | 
  
    | 114 |                      'BegTime'   : (u1['mlfBegTime'] + u1['mlfEndTime']) / 2.0,  # Middle of 1st phone
 | 
  
    | 115 |                      'EndTime'   : (u2['mlfBegTime'] + u2['mlfEndTime']) / 2.0,  # Middle of 2nd phone
 | 
  
    | 116 |                      'diphone'   :  u1['phone']      + u2['phone'],
 | 
  
    | 117 |                      'l-phone'   :  u1['phone'],
 | 
  
    | 118 |                      'r-phone'   :  u2['phone'],
 | 
  
    | 119 |                      'prosodeme' :  u2['prosodeme'],
 | 
  
    | 120 |                     })
 | 
  
    | 121 |     # Vrati seznam
 | 
  
    | 122 |     return uout
 | 
  
    | 123 | 
 | 
  
## Determines the phrase type from the last non-pause unit
 | 
  
    | 125 | #
 | 
  
    | 126 | def to_phrtype(units) :
 | 
  
    | 127 |     for u in reversed(units) :
 | 
  
    | 128 |         if   u.get('phone', ' ')       in '$%#' :
 | 
  
    | 129 |              continue
 | 
  
    | 130 |         if   u.get('diphone', ' ')[-1] in '$%#' :
 | 
  
    | 131 |              continue
 | 
  
    | 132 |         # Get the type
 | 
  
    | 133 |         p  = u.get('prosodeme', '?? NOT SET ??')
 | 
  
    | 134 |         # Convert the type
 | 
  
    | 135 |         if   p == '3.1' : return '3'
 | 
  
    | 136 |         elif p == '1.1' : return '1'
 | 
  
    | 137 |         elif p == '2.2' : return '2'
 | 
  
    | 138 |         elif p == '0'   : return '0'
 | 
  
    | 139 |         else            : raise ValueError('Unknown prosodeme {} for unit {}'.format(p, u))
 | 
  
    | 140 | 
 | 
  
    | 141 |     # Not found?
 | 
  
    | 142 |     raise ValueError('Unknown prosodeme for units {}'.format(units))
 | 
  
    | 143 | 
 | 
  
    | 144 | #
 | 
  
    | 145 | # ---------
 | 
  
    | 146 | #
 | 
  
    | 147 | if __name__ == '__main__' :
 | 
  
    | 148 |      main()
 |