FreeLing  3.0
phd.h
Go to the documentation of this file.
00001 /*
00002  * Phonetic Distance Scorer for the PHAST package
00003  * + See the features.ALL file for input description
00004  * + April 2006 pcomas@lsi.upc.edu
00005  *
00006  */
00007 #ifndef _phd_h
00008 #define _phd_h
00009 
00010 #include <cstdlib>
00011 #include <iostream>
00012 #include <sstream>
00013 #include <fstream>
00014 #include <map>
00015 #include <set>
00016 #include <string>
00017 #include <math.h>
00018 
00019 #include "freeling/morfo/util.h"
00020 
00021 
00022 #define ALPHSIZE 128
00023 
00024 template <typename T=int> 
00025   class phd {
00026 
00027  private:
00028  T csub, cexp, cvowel, cskip, cspace;
00029  T distance[ALPHSIZE][ALPHSIZE];
00030  std::set<wchar_t> svowels;      // set of vowel phonemes
00031  std::set<wchar_t> sconsonants;  // set of consonant phonemes
00032  int debug;
00033 
00034  inline T V(wchar_t a){ return svowels.find(a) != svowels.end() ? cvowel : 0; }
00035 
00036  public:
00037 
00038  phd(const std::wstring &fname){
00039 
00040    debug = 0;
00041    std::wstring s;
00042    wchar_t c;
00043    T t;
00044    int i,fcount;
00045    std::map<const std::wstring, int> flist;   // set of the features' names with its index
00046    std::map<const std::wstring, T> fweight; // set of the features' saliences
00047    std::map<const std::wstring, T> values;  // set of the numerical values of the multivaluated features
00048    std::set<std::wstring> svfeatures;  // set of attributes for vowel comparison
00049    std::set<std::wstring> scfeatures;  // set of attributes for other comparisons
00050    csub = 0;
00051    cskip = 0;
00052    cexp = 0;
00053    cvowel = 0;
00054 
00055    /**************************************************************
00056     *
00057     * READ INPUT FILES, BUILD MATRIX OF FEATURES
00058     *
00059     **************************************************************/
00060 
00061    T features [ALPHSIZE][ALPHSIZE];
00062 
00063    std::wifstream is;
00064    util::open_utf8_file(is,fname);
00065    if (is.fail()) {
00066      std::wcerr<<L"PHONETIC_DISTANCE: Error opening file "+fname;
00067      exit(1);
00068    }
00069 
00070    fcount = 0; 
00071 
00072    while(!is.eof()){
00073 
00074      is >> s;
00075 
00076      if( s[0] == L'#'){ 
00077        getline(is,s);
00078 
00079      } else if( s==L"FON:") {
00080        is >> c;     // this is the phoneme
00081        //cerr << "FONEMA "<< c << endl;
00082        getline(is,s); 
00083        std::wstringstream ss(s,std::stringstream::in);
00084        i = 0;
00085        while(ss>>s){
00086          if(s==L"+"){
00087            features[(int)c][i] = 100;
00088          }else if(s==L"-"){
00089            features[(int)c][i] = 0;
00090          }else{  // is a multivaluated feature
00091            features[(int)c][i] = values[s];
00092          }
00093          //cerr << "Posant " << features[c][i] << " a " << i << " (" << s << ")"<< endl;
00094          i++;
00095        }
00096 
00097      } else if( s==L"VALUE:") {
00098        is >> s >> t; // feature value is i
00099        values[s] = t;
00100        //cerr << "VALUE ADD: " << s << " <-- " << i << endl;
00101 
00102      } else if( s==L"WEIGHT:") {
00103        is >> s >> t; // feature s weights i
00104        fweight[s] = t;
00105 
00106      } else if( s==L"CONSTANT:") {
00107        is >> s >> t; // s takes value i
00108        if (s==L"Cskip")   { cskip = t;}
00109        else if(s==L"Csub"){ csub  = t;}
00110        else if(s==L"Cexp"){  cexp = t;}
00111        else if(s==L"Cvowel"){ cvowel = t;}
00112        else if(s==L"Cspace"){ cspace = t;}
00113        else{ std::wcerr << L"UNEXPECTED CONSTANT DEFINITION" << s << std::endl; }
00114 
00115      } else if( s==L"VOWELS:") {
00116        //create a list with the vocalic phonemes
00117        getline(is,s); 
00118        std::wstringstream ss(s, std::wstringstream::in);
00119        while( ss>>c ){  svowels.insert(c); }
00120 
00121      } else if( s==L"CONSONANTS:") {
00122        //create a set with the consonantic phonemes
00123        getline(is,s); 
00124        std::wstringstream ss(s, std::wstringstream::in);
00125        while( ss>>c ){  sconsonants.insert(c); }
00126 
00127      } else if( s==L"FEATURES:") {
00128        //create a list with the index inside the matrix for each feature
00129        getline(is,s); 
00130        std::wstringstream ss(s, std::wstringstream::in);
00131        i = 0;
00132        while( ss>>s ){ flist[s]=i; i++; }
00133 
00134      } else if( s==L"FVOWELS:") {
00135        //create a set with 
00136        getline(is,s); 
00137        std::wstringstream ss(s, std::wstringstream::in);
00138        while( ss>>s ){ svfeatures.insert(s); }
00139 
00140      } else if( s==L"FOTHER:") {
00141        //create a set with 
00142        getline(is,s); 
00143        std::wstringstream ss(s, std::wstringstream::in);
00144        while( ss>>s ){ scfeatures.insert(s); }
00145 
00146      } else {
00147        //skip
00148      }
00149       
00150    }
00151     
00152    is.close();
00153 
00154 
00155    /**************************************************************
00156     *
00157     * BUILD MATRIX OF DISTANCES
00158     *
00159     **************************************************************/
00160    /*
00161     */
00162     
00163    std::set<wchar_t>::iterator it1;
00164    std::set<wchar_t>::iterator it2;
00165    std::set<std::wstring>::iterator it3;
00166    T d;
00167    int f;
00168 
00169    for(int i=0;i<ALPHSIZE;i++){
00170      for(int j=0;j<ALPHSIZE;j++){
00171        distance[i][j]= i==j ? 0 : (T)8000;
00172      }
00173    }
00174 
00175    //Build vowels vs vowels
00176 
00177    for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00178      for( it2 = svowels.begin(); it2!=it1; ++it2){
00179        //calculate distance between it1 and it2 using features in it3
00180        d=0;
00181        for(it3 = svfeatures.begin(); it3!=svfeatures.end(); ++it3){
00182          f = flist[(*it3)];
00183          d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00184        }
00185        distance[(int)(*it1)][(int)(*it2)] = d;
00186        distance[(int)(*it2)][(int)(*it1)] = d;
00187      }
00188    }
00189 
00190 
00191    //Build vowels vs consonants
00192    for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){
00193      for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00194        //calculate distance between it1 and it2 using features in it3
00195        d=0;
00196        for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){
00197          f = flist[(*it3)];
00198          d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00199        }
00200        distance[(int)(*it1)][(int)(*it2)] = d;
00201        distance[(int)(*it2)][(int)(*it1)] = d;
00202      }
00203    }
00204 
00205 
00206    //Build consonants vs consonants
00207    for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){
00208      for( it2 = sconsonants.begin(); it2!=it1; ++it2){
00209        //calculate distance between it1 and it2 using features in it3
00210        d=0;
00211        for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){
00212          f = flist[(*it3)];
00213          d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)];
00214        }
00215        distance[(int)(*it1)][(int)(*it2)] = d;
00216        distance[(int)(*it2)][(int)(*it1)] = d;
00217      }
00218    }
00219 
00220    if(debug>2){
00221      std::wcerr << L"\t";
00222      for( int i=85; i<ALPHSIZE; i++ ){
00223        std::wcerr << (wchar_t)i << L"\t";
00224      }
00225      std::wcerr << std::endl;
00226 
00227      for( int i=85; i<ALPHSIZE; i++ ){
00228        std::wcerr << (wchar_t)i << L"\t";
00229        for( int j=85; j<ALPHSIZE; j++ ){
00230          std::wcerr << distance[i][j] << L"\t";
00231        }
00232        std::wcerr << std::endl;
00233      }
00234 
00235    }
00236 
00237 
00238  } //constructor
00239 
00240 
00241  void show(std::wostream &o){
00242 
00243    std::set<wchar_t>::iterator it1;
00244    std::set<wchar_t>::iterator it2;
00245    std::set<std::wstring>::iterator it3;
00246 
00247    o << L"Distances between phonemes" << std::endl << L"==========================" << std::endl << std::endl;
00248 
00249    o << L"Read values: cskip:" << cskip << L", csub:" << csub << L", cexp:" << cexp << L", cvowel:" << cvowel << std::endl;
00250 
00251 
00252    o << L"\t";
00253    for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t";
00254    o << std::endl;
00255 
00256    for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00257      o << (*it1) << L"\t";
00258      for( it2 = svowels.begin(); it2!=it1; ++it2){
00259        o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00260      }
00261      o << std::endl;
00262    }
00263 
00264    o << std::endl << L"\t";
00265    for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t";
00266    o << std::endl;
00267 
00268    // vowels vs consonants
00269    for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){
00270      o << (*it2) << L"\t";
00271      for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){
00272        o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00273      }
00274      o << std::endl;
00275    }
00276 
00277    o << std::endl << L"\t";
00278    for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1) o << (*it1) << L"\t";
00279    o << std::endl;
00280 
00281    // consonants vs consonants
00282    for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){
00283      o << (*it1) << L"\t";
00284      for( it2 = sconsonants.begin(); it2!=it1; ++it2){
00285        o << distance[(int)(*it1)][(int)(*it2)] << L"\t";
00286      }
00287      o << std::endl;
00288    }
00289  }
00290 
00291 
00292  T getCskip(){
00293    return cskip;
00294  }
00295 
00296  T dSkip(int c){
00297    return c==L' ' || c==L'_' ? cskip+cspace : cskip;
00298    //return cskip;
00299  }
00300 
00301  T dSub(int const a, int const b){
00302    if( ( (wchar_t)a==L' ' || (wchar_t)a==L'_' ) && ( (wchar_t)b==L' ' || (wchar_t)b==L'_' ) ){ return cspace; }
00303    return (wchar_t)a==L'_' || (wchar_t)a==L' ' || (wchar_t)b==L' ' || (wchar_t)b==L'_' ? -cspace/2 : csub - distance[a][b] - V(a) - V(b);
00304  }
00305 
00306  T dExp(int const a, int const b, int const c){
00307    return cexp - distance[a][b] - distance[a][c] - V(a) - std::max(V(b),V(c));
00308  }
00309   
00310 };
00311 
00312 
00313 
00314 #endif