#!/usr/bin/python
#
# Author: Bill Poser (wjposer@unagi.cis.upenn.edu)
#         Linguistic Data Consortium
#
# Converts ISCII code to ITRANS ASCII representation
#
# Version 2.2.1 (2003/06/05)
#
# Command line flags:
#
#   -lvu    use upper case letters to represent long vowels rather than
#           the default of double vowels (I rather than ii, etc.)
#   -nsh    translate "soft halant" into a "hard halant", thus remaining
#           within standard ITRANS, rather than emitting the extension ".hs"
#
# Input is read from standard input as raw bytes; output is written to
# standard output. Transliterated (formerly ISCII) stretches of the output
# are bracketed by the HZ-style delimiters "~{" and "~}", and a literal
# tilde in the input is escaped as "~~".

import sys
import string

# Initialize flags
StrictP = 0            # T => abort on invalid input
LongVowelsDoubleP = 1  # T => long vowels double. Else upper case.
NoSoftHalantP = 0      # T => do not generate soft halant code - pure ITRANS

# Flags for suspicious characters encountered
ATREncounteredP = 0
C7EncounteredP = 0
CEEncounteredP = 0
D3EncounteredP = 0
EXTEncounteredP = 0
INVEncounteredP = 0
IllegalEncounteredP = 0
UnmappedEncounteredP = 0  # codes with no table entry, or a stray nukta

# Exit status
ErrorCode = 0  # Default to error free

# Codes that get special treatment during word translation.
Halant = 0xE8
Nukta = 0xE9
SoftHalant = 0x99  # internal code produced from halant + nukta

# Create default translation table
xtbl = {
    0xa1: ".N",   # chandrabindu
    0xa2: ".n",   # anuswara
    0xa3: "H",    # visarga
    0xb3: "ka",
    0xb4: "kha",
    0xb5: "ga",
    0xb6: "gha",
    0xb7: "~Na",
    0xb8: "cha",
    0xb9: "Cha",
    0xba: "ja",
    0xbb: "jha",
    0xbc: "~na",
    0xbd: "Ta",
    0xbe: "Tha",
    0xbf: "Da",
    0xc0: "Dha",
    0xc1: "Na",
    0xc2: "ta",
    0xc3: "tha",
    0xc4: "da",
    0xc5: "dha",
    0xc6: "na",
    0xc8: "pa",
    0xc9: "pha",
    0xca: "ba",
    0xcb: "bha",
    0xcc: "ma",
    0xcd: "ya",
    0xcf: "ra",
    0xd0: "rra",  # I'm not sure about this one.
    0xd1: "la",
    0xd2: "La",
    0xd4: "va",
    0xd5: "sha",
    0xd6: "Sha",
    0xd7: "sa",
    0xd8: "ha",
    # Independent vowels
    0xa4: "a",
    0xa5: "A",
    0xa6: "i",
    0xa7: "I",
    0xa8: "u",
    0xa9: "U",
    0xaa: "ri",
    0xac: "e",
    0xad: "ai",
    0xb0: "o",
    0xb1: "au",
    # Vowel diacritics
    0xda: "A",
    0xdb: "i",
    0xdc: "I",
    0xdd: "u",
    0xde: "U",
    0xdf: "ri",
    0xe1: "e",
    0xe2: "ai",
    0xe5: "o",
    0xe6: "au",
    # Explicit halant - most ISCII halant codes are not translated directly
    0xE8: ".h",
    # Numerals
    0xF1: "0",
    0xF2: "1",
    0xF3: "2",
    0xF4: "3",
    0xF5: "4",
    0xF6: "5",
    0xF7: "6",
    0xF8: "7",
    0xF9: "8",
    0xFA: "9",
    # Punctuation
    0xEA: ".",    # daNDa
    # Nukta codes - these are NOT ISCII codes - they are just used internally
    # Possible in pure Hindi
    0x8a: "fa",    # voiceless labial fricative
    0x8b: ".da",   # voiced retroflex flap
    0x8c: ".dha",  # voiced aspirated retroflex flap
    # Urdu characters
    0x8d: "qa",    # voiceless uvular stop (corresponds to Urdu Qaf)
    0x8e: "Ka",    # voiceless velar fricative (corresponds to Urdu Khe)
    0x8f: "Ga",    # voiced velar or uvular fricative (Urdu Ghain)
    0x90: "za",    # voiced palatal fricative (Urdu Zhe) (alt ITRANS J)
    # Sanskrit characters
    0x91: "RRI",   # long syllabic /r/ vowel
    0x92: "RRI",   # long syllabic /r/ sign
    0x93: "LLi",   # syllabic /l/ vowel
    0x94: "LLi",   # syllabic /l/ sign
    0x95: "LLI",   # long syllabic /l/ vowel
    0x96: "LLI",   # long syllabic /l/ sign
    0x97: ".a",    # avagraha
    0x98: "OM",    # pranava (OM symbol)
    0x99: ".hs",   # soft halant (LDC extension - not standard ITRANS)
}

# Definitions of character classes
WhiteSpace = frozenset([
    0x20,  # space
    0x09,  # tab
    0x0D,  # carriage return
    0x0A,  # line feed
    0x0B,  # vertical tab
    0x0C,  # form feed
])

Punctuation = frozenset([
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
    0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 0x60,
    0x7B, 0x7C, 0x7D, 0x7E,
])

VowelSigns = frozenset([
    0xda,  # /aa/
    0xdb,  # /i/
    0xdc,  # /ii/
    0xdd,  # /u/
    0xde,  # /uu/
    0xe1,  # /e/
    0xe2,  # /ai/
    0xe5,  # /o/
    0xe6,  # /au/
    0xdf,  # /ri/
    0x92,  # /rri/
    0x94,  # /lli/
    0x96,  # /LLI/
])

# Codes whose translation carries an inherent /a/ that a following halant,
# vowel sign or word boundary removes: the ISCII consonant range plus the
# internal nukta-consonant codes.
Consonants = frozenset(range(0xB3, 0xD9)) | frozenset(range(0x8A, 0x91))

# Danda and the Devanagari numerals travel inside the word list (the danda
# has to, so that danda + nukta can become avagraha) but they end the word
# as far as final-/a/ deletion is concerned.
WordBreakers = frozenset([0xEA]) | frozenset(range(0xF1, 0xFB))

# Long vowels are held in upper case in xtbl; this gives the doubled form.
# It is applied per table entry so that "RRI" and "LLI" are left alone.
LongVowelDoubling = {"A": "aa", "I": "ii", "U": "uu"}

# The code in the first column plus an immediately following nukta
# is mapped to the code in the second column. The codes in the second
# column are only used internally. They are illegal ISCII codes.
NuktaCodeTbl = {
    0xC9: 0x8a,
    0xBF: 0x8b,
    0xC0: 0x8c,
    0xB3: 0x8d,
    0xB4: 0x8e,
    0xB5: 0x8f,
    0xBA: 0x90,
    0xAA: 0x91,
    0xDF: 0x92,
    0xA6: 0x93,
    0xDB: 0x94,  # vowel sign /i/ + nukta = syllabic /l/ sign
    0xA7: 0x95,
    0xDC: 0x96,
    0xEA: 0x97,  # avagraha
    0xA1: 0x98,  # pranava
    0xE8: 0x99,  # soft halant
}

Word = []


def CombineNuktas(codes):
    """Replace each "code + nukta" pair by its internal single code.

    Returns a tuple (combined, stray): a new list with the pairs listed in
    NuktaCodeTbl collapsed, and the number of nuktas that had nothing they
    could modify (word-initial, or after a code not in the table). Stray
    nuktas are dropped. The argument is not modified.
    """
    combined = []
    stray = 0
    for code in codes:
        if code != Nukta:
            combined.append(code)
        elif combined and combined[-1] in NuktaCodeTbl:
            combined[-1] = NuktaCodeTbl[combined[-1]]
        else:
            stray += 1
    return combined, stray


def TranslateWord(codes, double_long_vowels=None):
    """Translate one Devanagari word, given as ISCII codes, to ITRANS.

    codes is a sequence of integer byte values. double_long_vowels selects
    "aa/ii/uu" (true) or "A/I/U" (false) for long vowels; None means use
    the module flag LongVowelsDoubleP. Returns the ITRANS string.

    Codes that have no entry in xtbl, and stray nuktas, are omitted from
    the result and recorded in UnmappedEncounteredP.
    """
    global UnmappedEncounteredP

    if double_long_vowels is None:
        double_long_vowels = LongVowelsDoubleP

    codes, stray = CombineNuktas(codes)
    if stray:
        UnmappedEncounteredP = 1

    pieces = []
    # True while the last piece is a consonant still carrying its /a/.
    inherent = False
    count = len(codes)
    i = 0
    while i < count:
        code = codes[i]
        i += 1

        if code == Halant:
            # A single halant just removes the inherent vowel. Two in a
            # row (explicit halant) additionally emit the halant code; the
            # second one is consumed here so it is not processed again.
            if inherent:
                pieces[-1] = pieces[-1][:-1]
                inherent = False
            if i < count and codes[i] == Halant:
                pieces.append(xtbl[Halant])
                i += 1
            continue

        text = xtbl.get(code)
        if text is None:
            UnmappedEncounteredP = 1
            continue

        if double_long_vowels:
            text = LongVowelDoubling.get(text, text)

        # Soft halant and vowel signs replace the inherent /a/. Danda and
        # numerals end the word, and in Hindi, in contrast to Sanskrit, the
        # inherent /a/ is not pronounced at the end of the word.
        if inherent and (code == SoftHalant or code in VowelSigns
                         or code in WordBreakers):
            pieces[-1] = pieces[-1][:-1]

        pieces.append(text)
        inherent = code in Consonants

    if inherent:
        pieces[-1] = pieces[-1][:-1]

    return "".join(pieces)


def ProcessWord(DATransP):
    """Write the transliteration of the global Word and then clear it.

    If DATransP is true, the closing delimiter "~}" is written after the
    word, marking a transition from Devanagari to ASCII.
    """
    sys.stdout.write(TranslateWord(Word))
    del Word[:]
    if DATransP:
        sys.stdout.write("~}")


def Convert(data, out):
    """Transliterate the ISCII byte string data, writing ITRANS to out.

    The input is parsed into Devanagari words which are passed to
    TranslateWord. Whitespace and printable ASCII are passed through
    (tilde as "~~"); ATR and EXT are deleted together with the byte that
    follows them; INV is deleted. Suspicious and illegal codes are noted
    in the module-level *EncounteredP flags, and if StrictP is set the
    first such code stops the conversion.

    Whitespace is neutral with respect to the "~{" ... "~}" delimiters: it
    neither opens nor closes a transliterated stretch.
    """
    global ATREncounteredP, C7EncounteredP, CEEncounteredP, D3EncounteredP
    global EXTEncounteredP, INVEncounteredP, IllegalEncounteredP

    word = []        # ISCII codes of the word being collected
    hz_open = False  # "~{" written and not yet matched by "~}"
    skip = False     # discard the next byte (argument of ATR or EXT)

    def flush_word():
        if word:
            out.write(TranslateWord(word))
            del word[:]

    for c in bytearray(data):
        if skip:
            skip = False
            continue

        if c > 0x7F and not hz_open:  # transition to ISCII
            out.write("~{")
            hz_open = True

        # Detect various codes that ought not to occur or cannot be
        # processed normally.
        if c == 0xC7 or c == 0xCE or c == 0xD3:
            if c == 0xC7:
                C7EncounteredP = 1
            elif c == 0xCE:
                CEEncounteredP = 1
            else:
                D3EncounteredP = 1
            if StrictP:
                break
            flush_word()
            continue

        if c == 0xD9:
            INVEncounteredP = 1
            continue

        if c == 0xEF:
            ATREncounteredP = 1
            skip = True  # read next byte and discard it
            continue

        if c == 0xF0:
            EXTEncounteredP = 1
            skip = True  # read next byte and discard it
            continue

        if (0x8A <= c <= 0x9F) or (0xEB <= c <= 0xEE) or (0xFB <= c <= 0xFE):
            IllegalEncounteredP = 1
            if StrictP:
                break
            flush_word()
            continue

        # Here is where the real work begins.
        # Valid Hindi ISCII characters are appended to the current word.
        # Note that a few characters in this range were trapped above.
        # Therefore don't reorder this code.
        if 0xA1 <= c <= 0xFA:
            word.append(c)
            continue

        # Whitespace has to be treated separately so as not to emit a
        # transition from Devanagari to ASCII marker.
        if c in WhiteSpace:
            flush_word()
            out.write(chr(c))
            continue

        # ASCII letters, numerals and punctuation. Just pass on as is,
        # after closing any open Devanagari stretch.
        if (0x30 <= c <= 0x5A) or (0x61 <= c <= 0x7A) or c in Punctuation:
            flush_word()
            if hz_open:
                out.write("~}")
                hz_open = False
            if c == 0x7E:  # Escape tilde
                out.write("~~")
            else:
                out.write(chr(c))
            continue

        # Anything else (ASCII control characters, 0x80-0x89, 0xA0, 0xFF)
        # is dropped.

    # We have now read all input. Finish any word still in progress and
    # make sure to close any open delimiter. We treat EOF as ASCII.
    flush_word()
    if hz_open:
        out.write("~}")


def ReportEncountered():
    """Report on any interesting characters encountered, on stderr."""
    err = sys.stderr.write
    if IllegalEncounteredP:
        err("Illegal ISCII codes were encountered.\n")
        err("This input is probably not ISCII.\n\n")
    if UnmappedEncounteredP:
        err("Codes with no entry in the translation table, or a nukta\n")
        err("with nothing to modify, were encountered and omitted.\n\n")
    if C7EncounteredP:
        err("The code 0xC7 was encountered.\n")
        err("This character is not normally found in Hindi.\n")
        err("If this input is ISCII, it is probably Tamil.\n\n")
    if CEEncounteredP:
        err("The code 0xCE was encountered.\n")
        err("This character is not normally found in Hindi.\n")
        err("If this input is ISCII, it is probably Assamese, Bengali, or Oriya\n\n")
    if D3EncounteredP:
        err("The code 0xD3 was encountered.\n")
        err("This character is not normally found in Hindi.\n")
        err("If this input is ISCII, it is probably Tamil or Malayalam\n\n")
    if EXTEncounteredP:
        err("The code 0xF0 (EXT) was encountered.\n")
        err("This code cannot be transliterated.\n")
        err("It should not appear in normal Hindi text.\n")
        err("It is possible that the input contains embeded Vedic Sanskrit.\n\n")
    if ATREncounteredP:
        err("The code 0xEF (ATR) was encountered.\n")
        err("This code cannot be transliterated.\n")
        err("It is used to define font attributes.\n\n")
    if INVEncounteredP:
        err("The code 0xD9 (INV) was encountered.\n")
        err("This code cannot be transliterated properly.\n")
        err("There is no Itrans representation for it.\n")
        err("It is used for display purposes, to cause part of a Devanagari\n")
        err("character to be omitted.\n")


def main(argv=None):
    """Run the converter from stdin to stdout; return the exit status."""
    global LongVowelsDoubleP, NoSoftHalantP, ErrorCode

    if argv is None:
        argv = sys.argv

    # Handle command line flags
    for arg in argv[1:]:
        if arg == "-lvu":
            LongVowelsDoubleP = 0
        elif arg == "-nsh":
            NoSoftHalantP = 1

    if NoSoftHalantP:
        # Soft halant is emitted with the hard (explicit) halant code.
        xtbl[SoftHalant] = xtbl[Halant]

    # ISCII is a byte encoding, so read bytes. Python 3 exposes them as
    # sys.stdin.buffer; Python 2's sys.stdin already yields bytes.
    data = getattr(sys.stdin, "buffer", sys.stdin).read()

    Convert(data, sys.stdout)
    ReportEncountered()

    if StrictP:
        if (IllegalEncounteredP or C7EncounteredP or CEEncounteredP
                or D3EncounteredP or EXTEncounteredP):
            ErrorCode = 1
    return ErrorCode


if __name__ == "__main__":
    sys.exit(main())