| 1 | # coding: utf-8
 | 
  
    | 2 | 
 | 
  
    | 3 | import argparse
 | 
  
    | 4 | import codecs
 | 
  
    | 5 | import asflight
 | 
  
    | 6 | 
 | 
  
    | 7 | 
 | 
  
    | 8 | # monosyllabic prepositions + conjuctions a, i
 | 
  
    | 9 | proclitics_ver1 = [
 | 
  
    | 10 |     u"a", u"i", u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
 | 
  
    | 11 |     u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro", u"před", u"přes", u"při",
 | 
  
    | 12 |     u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně", u"z", u"za", u"ze", u"zpod" ]
 | 
  
    | 13 | 
 | 
  
    | 14 | # version #1 + monosyllabic relative pronouns
 | 
  
    | 15 | proclitics_ver2 = [
 | 
  
    | 16 |     u"a", u"i", u"bez", u"či", u"dle", u"do", u"k", u"ke", u"kol", u"ku",
 | 
  
    | 17 |     u"na", u"nad", u"o", u"ob", u"od", u"po", u"pod", u"pro", u"před", u"přes", u"při",
 | 
  
    | 18 |     u"s", u"se", u"skrz", u"u", u"v", u"ve", u"vně", u"z", u"za", u"ze", u"zpod",
 | 
  
    | 19 |     u"kdo", u"co", u"čí", u"jenž", u"jež", u"již", u"jíž", u"jichž", u"jimž",
 | 
  
    | 20 |     u"němž", u"níž", u"nichž", u"jímž", u"kom", u"kým", u"čem", u"čím", u"čím", u"čích" ]
 | 
  
    | 21 | 
 | 
  
    | 22 | # list of enclitic pronouns
 | 
  
    | 23 | enclitics = [ u"se", u"si" ]
 | 
  
    | 24 | 
 | 
  
    | 25 | 
 | 
  
    | 26 | # unit keys: phone, pwordBoundPos, pphrsBoundPos, prosodeme, word
 | 
  
    | 27 | 
 | 
  
    | 28 | 
 | 
  
    | 29 | def set_pwords( unit_list, ver=1 ):
 | 
  
    | 30 | 
 | 
  
    | 31 |     if ver == 1:
 | 
  
    | 32 |         proclitics = proclitics_ver1
 | 
  
    | 33 |     elif ver == 2:
 | 
  
    | 34 |         proclitics = proclitics_ver2
 | 
  
    | 35 | 
 | 
  
    | 36 |     # ----- process proclitics
 | 
  
    | 37 | 
 | 
  
    | 38 |     unit_prev = None
 | 
  
    | 39 |     append = False
 | 
  
    | 40 | 
 | 
  
    | 41 |     for unit in unit_list:
 | 
  
    | 42 | 
 | 
  
    | 43 |         if unit['pwordBoundPos'] == 'P':
 | 
  
    | 44 |             if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] != 'P' ):
 | 
  
    | 45 |                 unit_prev['pwordBoundPos'] = 'L'
 | 
  
    | 46 |             append = False
 | 
  
    | 47 | 
 | 
  
    | 48 |         elif ( unit['word'] is not None ) and ( unit['word'] != '.' ):
 | 
  
    | 49 | 
 | 
  
    | 50 |             if append is False:
 | 
  
    | 51 |                 if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] != 'P' ):
 | 
  
    | 52 |                     unit_prev['pwordBoundPos'] = 'L'
 | 
  
    | 53 |                 unit['pwordBoundPos'] = 'F'
 | 
  
    | 54 | 
 | 
  
    | 55 |             else:
 | 
  
    | 56 |                 if ( unit_prev is not None ) and ( unit_prev['pwordBoundPos'] != 'P' ) and ( unit_prev['pwordBoundPos'] != 'F' ):
 | 
  
    | 57 |                      unit_prev['pwordBoundPos'] = "-"
 | 
  
    | 58 |                 unit['pwordBoundPos'] = "-"
 | 
  
    | 59 | 
 | 
  
    | 60 |             append = ( unit['word'].lower() in proclitics )
 | 
  
    | 61 | 
 | 
  
    | 62 |         unit_prev = unit
 | 
  
    | 63 | 
 | 
  
    | 64 |     # ----- process enclitic
 | 
  
    | 65 | 
 | 
  
    | 66 |     append = False
 | 
  
    | 67 |     phr_end = False
 | 
  
    | 68 | 
 | 
  
    | 69 |     for unit in reversed( unit_list ):
 | 
  
    | 70 | 
 | 
  
    | 71 |         if ( unit['pphrsBoundPos'] == 'L' ):
 | 
  
    | 72 |             phr_end = True
 | 
  
    | 73 | 
 | 
  
    | 74 |         if append:
 | 
  
    | 75 |              unit['pwordBoundPos'] = '-'
 | 
  
    | 76 |              append = False
 | 
  
    | 77 | 
 | 
  
    | 78 |         if phr_end and ( unit['word'] is not None ) and ( unit['word'] != "." ):
 | 
  
    | 79 |             append = ( unit['word'].lower() in enclitics )
 | 
  
    | 80 |             if append:
 | 
  
    | 81 |                 unit['pwordBoundPos'] = '-'
 | 
  
    | 82 |             phr_end = False
 | 
  
    | 83 | 
 | 
  
    | 84 |     # ----- correction of prosodemes
 | 
  
    | 85 | 
 | 
  
    | 86 |     prosodeme = None
 | 
  
    | 87 |     phr_end = False
 | 
  
    | 88 | 
 | 
  
    | 89 |     for unit in reversed( unit_list ):
 | 
  
    | 90 | 
 | 
  
    | 91 |         if ( unit['pphrsBoundPos'] == 'L' ):
 | 
  
    | 92 |             prosodeme = unit['prosodeme']
 | 
  
    | 93 | 
 | 
  
    | 94 |         if prosodeme is not None:
 | 
  
    | 95 |             unit['prosodeme'] = prosodeme
 | 
  
    | 96 | 
 | 
  
    | 97 |             if unit['pwordBoundPos'] == 'F':
 | 
  
    | 98 |                 prosodeme = None
 | 
  
    | 99 | 
 | 
  
    | 100 |         elif ( unit['prosodeme'] != '0' ) and ( unit['prosodeme'] != 'X.X' ):
 | 
  
    | 101 |             unit['prosodeme'] = '0'
 | 
  
    | 102 | 
 | 
  
    | 103 | 
 | 
  
    | 104 | # ----------
 | 
  
    | 105 | 
 | 
  
    | 106 | 
 | 
  
    | 107 | def main():
 | 
  
    | 108 | 
 | 
  
    | 109 |     parser = argparse.ArgumentParser( description="Modify prosodic words in ASF file." )
 | 
  
    | 110 | 
 | 
  
    | 111 |     parser.add_argument( type=str, metavar="ASF_IN", dest="asfIn", help="input ASF file" )
 | 
  
    | 112 |     parser.add_argument( type=str, metavar="ASF_OUT", dest="asfOut", help="output ASF file" )
 | 
  
    | 113 |     parser.add_argument( "-c", "--code-page", type=str, metavar="CODEPAGE", dest="codePage",
 | 
  
    | 114 |                          help="encoding of all files, default value: %(default)s", default='utf-8' )
 | 
  
    | 115 | 
 | 
  
    | 116 |     args = parser.parse_args()
 | 
  
    | 117 | 
 | 
  
    | 118 |     asf = asflight.AsfLight( args.asfIn, args.codePage )
 | 
  
    | 119 | 
 | 
  
    | 120 |     for utt_name in asf:
 | 
  
    | 121 |         set_pwords( asf[ utt_name ] )
 | 
  
    | 122 | 
 | 
  
    | 123 |     asf.write( args.asfOut, args.codePage )
 | 
  
    | 124 | 
 | 
  
    | 125 | 
 | 
  
    | 126 | # ----------
 | 
  
    | 127 | 
 | 
  
    | 128 | # run the main program
 | 
  
    | 129 | if ( __name__ == "__main__" ):
 | 
  
    | 130 |     main()
 |