asf2json_mix.py - HQSYN16 - Projects of Department of Cybernetics & NTIS P1 - Cybernetic Systems, University of West Bohemia

Task #3855 » asf2json_mix.py

Tihelka Dan, 09.08.2016 15:15

    
      #!/usr/bin/python2

      import codecs

      import glob

      import json

      import asf_new as asf  # dummy asf_new.py. To be replace just by asf

      # Glob mask selecting the input ASF files processed by main()

      fmask = 'words_*.asf'

      # Base name of an output file; main() fills the three fields with
      # (word, sent1, sent2) and appends the '.json' / '.wav' extension

      omask = 'words_{}_{}-{}'  # Mask for the word

      # ASF items conversion: attribute name -> callable used to convert its
      # value; handed to asf.ASF() through the attrconv argument

      iconv = {'phone'         : unicode,

               'word'          : unicode,

               'prosodeme'     : unicode,

               'concatenate'   : unicode,

               'suspicious'    : unicode,

               'pphrsBoundPos' : unicode,

               'pwordBoundPos' : unicode,

               'mlfEndTime'    : float,

               'mlfBegTime'    : float,

               'HTKScore'      : unicode,  # Not used as number here

              }

      def main() :
          """
          Build "mixed" JSON descriptions of words from per-word ASF files.

          Every file matching ``fmask`` is read through ``asf.ASF``. Each
          sentence found in the file is cut at the unit whose 'concatenate'
          attribute equals '*><*' into a leading and a trailing part, and both
          parts are converted to diphones by to_diphs(). For every ordered pair
          of different sentences (sent1, sent2) the leading part of sent1 is
          joined with the trailing part of sent2, a pause diphone is added on
          both ends and the result is written to
          ``omask.format(word, sent1, sent2) + '.json'``.

          Raises:
              ValueError: propagated from to_phrtype() when the phrase type
                  of a joined unit sequence cannot be determined.
          """
          # Diphone attributes copied to the 'keep' record of an output unit
          keep_keys = ('BegTime', 'EndTime', 'Sentence')
          # Fixed decoration of the input file names (see fmask)
          prefix, suffix = 'words_', '.asf'

          # Load the ASF files - every file holds one and the same word
          for fname in glob.glob(fmask) :
              # Read the ASF data
              asfdata = asf.ASF(attrconv = iconv)
              asfdata.read_file(fname)

              # The file name is the phonetic form of the word wrapped in a
              # prefix and a suffix. Strip them at the ends only, so that the
              # same text occurring inside the word itself is left alone.
              word = fname
              if word.startswith(prefix) :
                  word = word[len(prefix):]
              if word.endswith(suffix) :
                  word = word[:-len(suffix)]
              phntext = '|{}|'.format(word)

              # Individual middle-phone splits for the given word:
              # sentence -> (leading diphones, trailing diphones)
              splits = {}
              for sent, segs in asfdata.iteritems() :
                  # Find the split unit
                  split, _ = segs.find_exact('concatenate', '*><*')
                  # Both slices contain the unit at index `split`
                  lead  = segs[0:split+1]
                  trail = segs[split:]
                  # Convert them to diphones and store the parts
                  splits[sent] = (to_diphs(lead, sent), to_diphs(trail, sent))

              # Combine the parts of different sentences into JSON structures
              for sent1, (head, _) in splits.iteritems() :
                  for sent2, (_, tail) in splits.iteritems() :
                      # Ignore the matching parts
                      if sent1 == sent2 :
                          continue

                      # Output file (without extension)
                      ofile = omask.format(word, sent1, sent2)

                      # Join the parts. The concatenation builds a new list, so
                      # the lists stored in `splits` are not modified below.
                      join = head + tail
                      # Add first and final pause
                      join.insert(0, {'diphone' : '${}'.format(join[ 0]['l-phone'])})
                      join.append(   {'diphone' : '{}$'.format(join[-1]['r-phone'])})

                      # Build the list of units
                      units = []
                      for u in join :
                          item = {'name' : u['diphone']}
                          # Only timing and sentence info identify the candidate
                          # to be kept; the added pauses carry none of it and so
                          # get no 'keep' record at all.
                          keep = {k : u[k] for k in keep_keys if k in u}
                          if keep :
                              item['keep'] = (keep, )
                          units.append(item)

                      # Build JSON from it
                      data = {'phntext' :  phntext,
                              'phrtype' :  to_phrtype(join),
                              'phrlctx' : '$',
                              'phrrctx' : '$',
                              'units'   :  units,
                              'File'    :  ofile + '.wav',
                             }

                      # Store the JSON to disc, wrapped in a one-item sequence.
                      # The context manager closes the file even when dumping fails.
                      with codecs.open(ofile + '.json', 'wt', 'utf8') as ojson :
                          json.dump((data, ), ojson, sort_keys = True, indent = 4)

      #                # Add it to the main json

      #                sdata.append(data)

      #    # Store the JSON to disc

      #    json.dump(sdata, codecs.open(ojson, 'wt', 'utf8'), sort_keys = True, indent = 4)

      ## Converts a sequence of phones into a sequence of diphones

      #

      def to_diphs(units, sent) :
          """
          Turn a sequence of phone units into a list of diphone descriptions.

          Each pair of neighbouring phones yields one dict holding the sentence
          identifier, the diphone span (from the middle of the left phone to
          the middle of the right phone), the diphone name, the names of both
          phones and the prosodeme of the right phone.

          Args:
              units: sliceable sequence of mappings with the keys 'phone',
                  'mlfBegTime', 'mlfEndTime' and 'prosodeme'.
              sent: value stored under 'Sentence' in every diphone.

          Returns:
              A new list of dicts, one item shorter than `units` (empty for
              fewer than two units).
          """
          def centre(phone) :
              # Middle of the phone, between its begin and end time
              return (phone['mlfBegTime'] + phone['mlfEndTime']) / 2.0

          diphs = []
          # Walk over all the pairs of neighbouring phones
          for left, right in zip(units[:-1], units[1:]) :
              diph = {}
              diph['Sentence']  = sent
              diph['BegTime']   = centre(left)
              diph['EndTime']   = centre(right)
              diph['diphone']   = left['phone'] + right['phone']
              diph['l-phone']   = left['phone']
              diph['r-phone']   = right['phone']
              # The prosodeme is taken from the right-hand phone
              diph['prosodeme'] = right['prosodeme']
              diphs.append(diph)
          return diphs

      ## Determines the phrase type from the last non-pause unit

      #

      def to_phrtype(units) :
          """
          Derive the phrase type from the last non-pause unit.

          The units are scanned from the end. A unit is skipped when its
          'phone' value is found in '$%#' or when the last character of its
          'diphone' value is one of '$', '%', '#'. The prosodeme of the first
          unit not skipped decides the result.

          Args:
              units: reversible sequence of mappings (phones or diphones).

          Returns:
              '3', '1', '2' or '0' for the prosodemes '3.1', '1.1', '2.2'
              and '0' respectively.

          Raises:
              ValueError: if the deciding unit has a different (or no)
                  prosodeme, or if all the units were skipped.
          """
          pauses   = '$%#'
          # Prosodeme -> phrase type
          phrtypes = (('3.1', '3'), ('1.1', '1'), ('2.2', '2'), ('0', '0'))

          for unit in reversed(units) :
              # Skip the pauses, no matter if given as a phone or as a diphone
              if unit.get('phone', ' ') in pauses or unit.get('diphone', ' ')[-1] in pauses :
                  continue
              # Get the type and convert it
              prosodeme = unit.get('prosodeme', '?? NOT SET ??')
              for code, phrtype in phrtypes :
                  if prosodeme == code :
                      return phrtype
              raise ValueError('Unknown prosodeme {} for unit {}'.format(prosodeme, unit))

          # Not found?
          raise ValueError('Unknown prosodeme for units {}'.format(units))

      #

      # ---------

      #

      # Run the conversion only when executed as a script, not when imported

      if __name__ == '__main__' :

           main()

« Previous
1
2
Next »

(2-2/2)

Project

General

Profile

HQSYN16

Task #3855 » asf2json_mix.py