How can you identify the language of a text sample without using dictionaries? It can be done by comparing the frequencies of letters in the sample with those of known languages. For example, in Polish the letter 'a' accounts for about 0.0551146 of all letters, in French for 0.049458, and in German for 0.0434701.
I created a small C++ program that takes as its argument the path to a file written in an unknown language and prints how much that language differs from each of the languages the program knows. The lowest result is the best match. The letter frequencies of the known languages are computed from the files in the samples directory.
Below is the program's output for a test sample, followed by the sample itself (in French):
bash-3.2$ ./a.out test3.txt difference between Polish language: 0.0965482 difference between French language: 0.0442431 difference between German language: 0.0945827
Au contraire de Józef Piłsudski, qui rêvait d'une grande fédération slave alliée à l'Allemagne contre la Russie, Roman Dmowski était désireux de fonder une Pologne alliée à la Russie : il a ainsi soutenu l'idée d'une association avec l'Empire russe, puis, malgré son anticommunisme, avec l'Union soviétique, qu'il considère malgré tout comme un prolongement de la politique des tsars. De même, bien qu'agnostique, Dmowski ne concevait l'existence d'un État polonais que sur une base catholique, et ethniquement homogène, ce qui impliquait pour les populations allogènes (Baltes, Ukrainiens, Allemands et mêmes Juifs) l'expulsion ou l'assimilation, rejetant ainsi le concept de République des Deux Nations, avancé par la gauche et les libéraux.
Below is the source code of the program.
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

/**
 * Letter-frequency profile built from one or more text files.
 *
 * The contents of all files are lower-cased and concatenated; the profile then
 * stores, for each of the 26 letters 'a'..'z', the number of occurrences
 * divided by the total length of the concatenated text. Two profiles are
 * compared with the Euclidean distance between their 26 frequencies.
 */
class Language {
public:
    /**
     * Builds the profile from the given files.
     *
     * @param filenames paths of the text files that make up the sample
     * @throws std::runtime_error if any of the files cannot be opened
     */
    explicit Language(const std::vector<std::string>& filenames) {
        for (std::vector<std::string>::const_iterator name = filenames.begin();
             name != filenames.end(); ++name) {
            std::ifstream input(name->c_str());
            if (!input) {
                // An unreadable file would otherwise contribute nothing and
                // silently skew (or empty) the profile.
                throw std::runtime_error("cannot open sample file: " + *name);
            }

            // Read the whole file into a string.
            std::stringstream buffer;
            buffer << input.rdbuf();
            std::string data = buffer.str();

            // Lower-case the content so 'A' and 'a' are counted together.
            std::transform(data.begin(), data.end(), data.begin(), &Language::toLower);
            sample += data;
        }
        computeLettersDistribution();
    }

    /** @return a copy of the letter -> frequency table ('a'..'z'). */
    std::map<char, double> getLettersDistribution() const {
        return lettersDistribution;
    }

    /**
     * Euclidean distance between this profile and another one.
     *
     * @param other profile to compare against
     * @return square root of the summed squared frequency differences;
     *         0 means identical distributions, lower means more similar
     */
    double compare(const Language& other) const {
        double sumOfSquares = 0.0;
        for (std::map<char, double>::const_iterator mine = lettersDistribution.begin();
             mine != lettersDistribution.end(); ++mine) {
            const std::map<char, double>::const_iterator theirs =
                other.lettersDistribution.find(mine->first);
            // A letter missing from the other table counts as frequency 0.
            const double theirFrequency =
                (theirs != other.lettersDistribution.end()) ? theirs->second : 0.0;
            const double delta = mine->second - theirFrequency;
            sumOfSquares += delta * delta;
        }
        return std::sqrt(sumOfSquares);
    }

protected:
    /**
     * Lower-cases one character.
     *
     * The value is passed to std::tolower as unsigned char: handing it a
     * negative char (any non-ASCII byte where char is signed) is undefined
     * behaviour.
     */
    static char toLower(char c) {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }

    /**
     * Fills lettersDistribution from the current sample.
     *
     * The denominator is the full sample length, so every byte that is not
     * 'a'..'z' (whitespace, punctuation, bytes of multi-byte characters) still
     * counts towards the total. An empty sample yields all-zero frequencies
     * instead of dividing by zero.
     */
    void computeLettersDistribution() {
        for (char letter = 'a'; letter <= 'z'; ++letter) {
            lettersDistribution[letter] = 0.0;
        }

        for (std::string::const_iterator c = sample.begin(); c != sample.end(); ++c) {
            if (*c >= 'a' && *c <= 'z') {
                lettersDistribution[*c] += 1.0;
            }
        }

        if (sample.empty()) {
            return;  // nothing to normalise; avoids 0/0 -> NaN
        }

        const double total = static_cast<double>(sample.length());
        for (std::map<char, double>::iterator entry = lettersDistribution.begin();
             entry != lettersDistribution.end(); ++entry) {
            entry->second /= total;
        }
    }

    std::string sample;                          // lower-cased text of all input files
    std::map<char, double> lettersDistribution;  // key: letter, value: its frequency in the sample
};

namespace {

// Reference samples for each known language.
const char* const kPolishSamples[] = {
    "samples/polski_1.txt",
    "samples/polski_2.txt"
};
const char* const kFrenchSamples[] = {
    "samples/francais_1.txt",
    "samples/francais_2.txt",
    "samples/francais_3.txt"
};
const char* const kGermanSamples[] = {
    "samples/deutsch_1.txt",
    "samples/deutsch_2.txt"
};

/** Copies a fixed array of paths into a vector of strings. */
template <std::size_t N>
std::vector<std::string> samplePaths(const char* const (&paths)[N]) {
    return std::vector<std::string>(paths, paths + N);
}

}  // namespace

/**
 * Prints the distance between the file named on the command line and each of
 * the known languages; the lowest value is the best match.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE on wrong usage or when a
 *         sample file cannot be opened
 */
int main(int argc, char* argv[]) {
    if (argc != 2) {
        const char* program = (argc > 0 && argv[0] != 0) ? argv[0] : "program";
        std::cerr << "usage: " << program << " filename" << '\n';
        return EXIT_FAILURE;
    }

    try {
        const Language polish(samplePaths(kPolishSamples));
        const Language francais(samplePaths(kFrenchSamples));
        const Language deutsch(samplePaths(kGermanSamples));

        std::vector<std::string> investigatedSamples;
        investigatedSamples.push_back(argv[1]);
        const Language investigated(investigatedSamples);

        std::cout << "difference between Polish language: " << investigated.compare(polish) << '\n';
        std::cout << "difference between French language: " << investigated.compare(francais) << '\n';
        std::cout << "difference between German language: " << investigated.compare(deutsch) << '\n';
    } catch (const std::exception& error) {
        std::cerr << "error: " << error.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
0 commentaires:
Post a Comment