By Ralph
Friday November 6, 2009
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <vector>
using namespace std;
// Make know words map
map<string,int> get_words(istream &input){
map<string,int> words;
string word;
// Go through big file collecting words
while(!input.eof()){
input >> word;
// Make sure string is a word
int len = word.length();
bool isWord = true;
for( int x = 0; x < len; x++){
word[x]=tolower(word[x]);
if(!isalpha(word[x]))
isWord = false;
}
// Put words into map that keeps track
// of count
if(isWord)
words[word]++;
}
return words;
}
// Make words of distance one
vector<string> edits1(string word){
vector<string> words;
int n = word.length();
// Deletions
for(int x = 0;x < n;x++){
string temp = word;
temp.erase(temp.begin() + x);
words.push_back(temp);
}
// Transpositions
for(int x = 0; x < n-1; x++){
string temp = word;
char c;
c = temp[x];
temp[x] = temp[x+1];
temp[x+1] = c;
words.push_back(temp);
}
// Alterations
string alphabet = "abcdefghijklmnopqrstuvwxyz";
for(int x = 0; x < n; x++){
for(int y = 0; y < 26; y++){
string temp = word;
temp[x] = alphabet[y];
words.push_back(temp);
}
}
// Insertions
for(int x=0;x<=n;x++){
for(int y=0;y<26;y++){
string temp = word;
temp.insert(x,1,alphabet[y]);
words.push_back(temp);
}
}
return words;
}
// Make known words of distance 2
vector<string> known_edits2(string word, map<string,int> known_words){
vector<string> words,words2,temp;
// Get words of distance 1
words = edits1(word);
int count = words.size();
// Get words of distance 2
for(int x=0;x<count;x++){
temp = edits1(words[x]);
int count = temp.size();
// Insert known words
for(int x=0;x<count;x++)
if(known_words[(temp[x])])
words2.push_back(temp[x]);
}
return words2;
}
// Find known words with input as vector
vector<string> known(vector<string> temp,map<string,int> known_words){
vector<string> words;
int count = temp.size();
// Push back known words
for(int x=0;x<count;x++){
if(known_words[(temp[x])])
words.push_back(temp[x]);
}
return words;
}
// Find know words with input as string
vector<string> known(string temp,map<string,int> known_words){
vector<string> words;
// is the word known?
if(known_words[temp])
words.push_back(temp);
return words;
}
// Find the correct word
string correct(string word, map<string,int> known_words){
vector<string> candidates;
// Is the word known?
candidates = known(word,known_words);
// If not, get words from distance 1
if(candidates.size() == 0)
candidates = known(edits1(word),known_words);
// If nothing from distance 1 is a word,
// get words from distance 2
if(candidates.size() == 0)
candidates = known_edits2(word,known_words);
// If nothing return word
if(candidates.size() == 0)
return word;
int count = candidates.size();
int max=0,temp=0;
string candidate;
// Get word with the highest usage from candidates
for(int x=0;x<count;x++){
temp = known_words[(candidates[x])];
if(temp > max){
max = temp;
candidate = candidates[x];
}
}
return candidate;
}
int main(int argc, char* argv[]){
if(argc != 2){
cerr << "Usage: " << argv[0] << "string" << endl;
return -1;
}
// Get word from user input
string word = argv[1];
// File that has huge collection of words
ifstream file("big.txt");
// Create map of words
map<string,int> known_words = get_words(file);
// Get correct word
string correction = correct(word, known_words);
cout << "correct( " << word << " ) == " << correction << endl;
return 0;
}
Related posts: