|
FreeLing
3.0
|
00001 /* 00002 * Phonetic Distance Scorer for the PHAST package 00003 * + See the features.ALL file for input description 00004 * + April 2006 pcomas@lsi.upc.edu 00005 * 00006 */ 00007 #ifndef _phd_h 00008 #define _phd_h 00009 00010 #include <cstdlib> 00011 #include <iostream> 00012 #include <sstream> 00013 #include <fstream> 00014 #include <map> 00015 #include <set> 00016 #include <string> 00017 #include <math.h> 00018 00019 #include "freeling/morfo/util.h" 00020 00021 00022 #define ALPHSIZE 128 00023 00024 template <typename T=int> 00025 class phd { 00026 00027 private: 00028 T csub, cexp, cvowel, cskip, cspace; 00029 T distance[ALPHSIZE][ALPHSIZE]; 00030 std::set<wchar_t> svowels; // set of vowel phonemes 00031 std::set<wchar_t> sconsonants; // set of consonant phonemes 00032 int debug; 00033 00034 inline T V(wchar_t a){ return svowels.find(a) != svowels.end() ? cvowel : 0; } 00035 00036 public: 00037 00038 phd(const std::wstring &fname){ 00039 00040 debug = 0; 00041 std::wstring s; 00042 wchar_t c; 00043 T t; 00044 int i,fcount; 00045 std::map<const std::wstring, int> flist; // set of the features' names with its index 00046 std::map<const std::wstring, T> fweight; // set of the features' saliences 00047 std::map<const std::wstring, T> values; // set of the numerical values of the multivaluated features 00048 std::set<std::wstring> svfeatures; // set of attributes for vowel comparison 00049 std::set<std::wstring> scfeatures; // set of attributes for other comparisons 00050 csub = 0; 00051 cskip = 0; 00052 cexp = 0; 00053 cvowel = 0; 00054 00055 /************************************************************** 00056 * 00057 * READ INPUT FILES, BUILD MATRIX OF FEATURES 00058 * 00059 **************************************************************/ 00060 00061 T features [ALPHSIZE][ALPHSIZE]; 00062 00063 std::wifstream is; 00064 util::open_utf8_file(is,fname); 00065 if (is.fail()) { 00066 std::wcerr<<L"PHONETIC_DISTANCE: Error opening file "+fname; 00067 exit(1); 00068 } 00069 00070 fcount = 0; 00071 00072 while(!is.eof()){ 00073 00074 is >> s; 00075 00076 if( s[0] == L'#'){ 00077 getline(is,s); 00078 00079 } else if( s==L"FON:") { 00080 is >> c; // this is the phoneme 00081 //cerr << "FONEMA "<< c << endl; 00082 getline(is,s); 00083 std::wstringstream ss(s,std::stringstream::in); 00084 i = 0; 00085 while(ss>>s){ 00086 if(s==L"+"){ 00087 features[(int)c][i] = 100; 00088 }else if(s==L"-"){ 00089 features[(int)c][i] = 0; 00090 }else{ // is a multivaluated feature 00091 features[(int)c][i] = values[s]; 00092 } 00093 //cerr << "Posant " << features[c][i] << " a " << i << " (" << s << ")"<< endl; 00094 i++; 00095 } 00096 00097 } else if( s==L"VALUE:") { 00098 is >> s >> t; // feature value is i 00099 values[s] = t; 00100 //cerr << "VALUE ADD: " << s << " <-- " << i << endl; 00101 00102 } else if( s==L"WEIGHT:") { 00103 is >> s >> t; // feature s weights i 00104 fweight[s] = t; 00105 00106 } else if( s==L"CONSTANT:") { 00107 is >> s >> t; // s takes value i 00108 if (s==L"Cskip") { cskip = t;} 00109 else if(s==L"Csub"){ csub = t;} 00110 else if(s==L"Cexp"){ cexp = t;} 00111 else if(s==L"Cvowel"){ cvowel = t;} 00112 else if(s==L"Cspace"){ cspace = t;} 00113 else{ std::wcerr << L"UNEXPECTED CONSTANT DEFINITION" << s << std::endl; } 00114 00115 } else if( s==L"VOWELS:") { 00116 //create a list with the vocalic phonemes 00117 getline(is,s); 00118 std::wstringstream ss(s, std::wstringstream::in); 00119 while( ss>>c ){ svowels.insert(c); } 00120 00121 } else if( s==L"CONSONANTS:") { 00122 //create a set with the consonantic phonemes 00123 getline(is,s); 00124 std::wstringstream ss(s, std::wstringstream::in); 00125 while( ss>>c ){ sconsonants.insert(c); } 00126 00127 } else if( s==L"FEATURES:") { 00128 //create a list with the index inside the matrix for each feature 00129 getline(is,s); 00130 std::wstringstream ss(s, std::wstringstream::in); 00131 i = 0; 00132 while( ss>>s ){ flist[s]=i; i++; } 00133 00134 } else if( s==L"FVOWELS:") { 00135 //create a set with 00136 getline(is,s); 00137 std::wstringstream ss(s, std::wstringstream::in); 00138 while( ss>>s ){ svfeatures.insert(s); } 00139 00140 } else if( s==L"FOTHER:") { 00141 //create a set with 00142 getline(is,s); 00143 std::wstringstream ss(s, std::wstringstream::in); 00144 while( ss>>s ){ scfeatures.insert(s); } 00145 00146 } else { 00147 //skip 00148 } 00149 00150 } 00151 00152 is.close(); 00153 00154 00155 /************************************************************** 00156 * 00157 * BUILD MATRIX OF DISTANCES 00158 * 00159 **************************************************************/ 00160 /* 00161 */ 00162 00163 std::set<wchar_t>::iterator it1; 00164 std::set<wchar_t>::iterator it2; 00165 std::set<std::wstring>::iterator it3; 00166 T d; 00167 int f; 00168 00169 for(int i=0;i<ALPHSIZE;i++){ 00170 for(int j=0;j<ALPHSIZE;j++){ 00171 distance[i][j]= i==j ? 0 : (T)8000; 00172 } 00173 } 00174 00175 //Build vowels vs vowels 00176 00177 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00178 for( it2 = svowels.begin(); it2!=it1; ++it2){ 00179 //calculate distance between it1 and it2 using features in it3 00180 d=0; 00181 for(it3 = svfeatures.begin(); it3!=svfeatures.end(); ++it3){ 00182 f = flist[(*it3)]; 00183 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00184 } 00185 distance[(int)(*it1)][(int)(*it2)] = d; 00186 distance[(int)(*it2)][(int)(*it1)] = d; 00187 } 00188 } 00189 00190 00191 //Build vowels vs consonants 00192 for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){ 00193 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00194 //calculate distance between it1 and it2 using features in it3 00195 d=0; 00196 for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){ 00197 f = flist[(*it3)]; 00198 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00199 } 00200 distance[(int)(*it1)][(int)(*it2)] = d; 00201 distance[(int)(*it2)][(int)(*it1)] = d; 00202 } 00203 } 00204 00205 00206 //Build consonants vs consonants 00207 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){ 00208 for( it2 = sconsonants.begin(); it2!=it1; ++it2){ 00209 //calculate distance between it1 and it2 using features in it3 00210 d=0; 00211 for(it3 = scfeatures.begin(); it3!=scfeatures.end(); ++it3){ 00212 f = flist[(*it3)]; 00213 d += abs( features[(int)(*it1)][(int)f] - features[(int)(*it2)][(int)f] ) * fweight[(*it3)]; 00214 } 00215 distance[(int)(*it1)][(int)(*it2)] = d; 00216 distance[(int)(*it2)][(int)(*it1)] = d; 00217 } 00218 } 00219 00220 if(debug>2){ 00221 std::wcerr << L"\t"; 00222 for( int i=85; i<ALPHSIZE; i++ ){ 00223 std::wcerr << (wchar_t)i << L"\t"; 00224 } 00225 std::wcerr << std::endl; 00226 00227 for( int i=85; i<ALPHSIZE; i++ ){ 00228 std::wcerr << (wchar_t)i << L"\t"; 00229 for( int j=85; j<ALPHSIZE; j++ ){ 00230 std::wcerr << distance[i][j] << L"\t"; 00231 } 00232 std::wcerr << std::endl; 00233 } 00234 00235 } 00236 00237 00238 } //constructor 00239 00240 00241 void show(std::wostream &o){ 00242 00243 std::set<wchar_t>::iterator it1; 00244 std::set<wchar_t>::iterator it2; 00245 std::set<std::wstring>::iterator it3; 00246 00247 o << L"Distances between phonemes" << std::endl << L"==========================" << std::endl << std::endl; 00248 00249 o << L"Read values: cskip:" << cskip << L", csub:" << csub << L", cexp:" << cexp << L", cvowel:" << cvowel << std::endl; 00250 00251 00252 o << L"\t"; 00253 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t"; 00254 o << std::endl; 00255 00256 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00257 o << (*it1) << L"\t"; 00258 for( it2 = svowels.begin(); it2!=it1; ++it2){ 00259 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00260 } 00261 o << std::endl; 00262 } 00263 00264 o << std::endl << L"\t"; 00265 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1) o << (*it1) << L"\t"; 00266 o << std::endl; 00267 00268 // vowels vs consonants 00269 for( it2 = sconsonants.begin(); it2!=sconsonants.end(); ++it2){ 00270 o << (*it2) << L"\t"; 00271 for( it1 = svowels.begin(); it1!=svowels.end(); ++it1){ 00272 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00273 } 00274 o << std::endl; 00275 } 00276 00277 o << std::endl << L"\t"; 00278 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1) o << (*it1) << L"\t"; 00279 o << std::endl; 00280 00281 // consonants vs consonants 00282 for( it1 = sconsonants.begin(); it1!=sconsonants.end(); ++it1){ 00283 o << (*it1) << L"\t"; 00284 for( it2 = sconsonants.begin(); it2!=it1; ++it2){ 00285 o << distance[(int)(*it1)][(int)(*it2)] << L"\t"; 00286 } 00287 o << std::endl; 00288 } 00289 } 00290 00291 00292 T getCskip(){ 00293 return cskip; 00294 } 00295 00296 T dSkip(int c){ 00297 return c==L' ' || c==L'_' ? cskip+cspace : cskip; 00298 //return cskip; 00299 } 00300 00301 T dSub(int const a, int const b){ 00302 if( ( (wchar_t)a==L' ' || (wchar_t)a==L'_' ) && ( (wchar_t)b==L' ' || (wchar_t)b==L'_' ) ){ return cspace; } 00303 return (wchar_t)a==L'_' || (wchar_t)a==L' ' || (wchar_t)b==L' ' || (wchar_t)b==L'_' ? -cspace/2 : csub - distance[a][b] - V(a) - V(b); 00304 } 00305 00306 T dExp(int const a, int const b, int const c){ 00307 return cexp - distance[a][b] - distance[a][c] - V(a) - std::max(V(b),V(c)); 00308 } 00309 00310 }; 00311 00312 00313 00314 #endif
1.7.6.1