#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include <vector>
using namespace std;

// Make know words map
map<string,int> get_words(istream &input){
  map<string,int> words;
  string word;
  // Go through big file collecting words
  while(!input.eof()){
    input >> word;
    // Make sure string is a word
    int len = word.length();
    bool isWord = true;
    for( int x = 0; x < len; x++){
      word[x]=tolower(word[x]);
      if(!isalpha(word[x]))
        isWord = false;
    }
    // Put words into map that keeps track
    // of count
    if(isWord)
      words[word]++;
  }
  return words;
}

// Make words of distance one
vector<string> edits1(string word){
  vector<string> words;
  int n = word.length();
  // Deletions
  for(int x = 0;x < n;x++){
    string temp = word;
    temp.erase(temp.begin() + x);
    words.push_back(temp);
  }

  // Transpositions
  for(int x = 0; x < n-1; x++){
    string temp = word;
    char c;
    c = temp[x];
    temp[x] = temp[x+1];
    temp[x+1] = c;
    words.push_back(temp);
  }  

  // Alterations
  string alphabet = "abcdefghijklmnopqrstuvwxyz";
  for(int x = 0; x < n; x++){
    for(int y = 0; y < 26; y++){
      string temp = word;
      temp[x] = alphabet[y];
      words.push_back(temp);
    }
  }

  // Insertions
  for(int x=0;x<=n;x++){
		for(int y=0;y<26;y++){
			string temp = word;
			temp.insert(x,1,alphabet[y]);
			words.push_back(temp);
		}
	}
 
	return words;
 
}
 
// Make known words of distance 2
vector<string> known_edits2(string word, map<string,int> known_words){
	vector<string> words,words2,temp;
 
	// Get words of distance 1
	words = edits1(word);
 
	int count = words.size();
 
	// Get words of distance 2
	for(int x=0;x<count;x++){
		temp = edits1(words[x]);
 
		int count = temp.size();
 
		// Insert known words
		for(int x=0;x<count;x++)
			if(known_words[(temp[x])])
				words2.push_back(temp[x]);
 
	}
 
	return words2;
}
 
// Find known words with input as vector
vector<string> known(vector<string> temp,map<string,int> known_words){
	vector<string> words;
 
	int count = temp.size();
 
	// Push back known words
	for(int x=0;x<count;x++){
		if(known_words[(temp[x])])
			words.push_back(temp[x]);
	}
 
	return words;
}
 
// Find know words with input as string
vector<string> known(string temp,map<string,int> known_words){
	vector<string> words;
 
	// is the word known?
	if(known_words[temp])
		words.push_back(temp);
 
	return words;
}
 
// Find the correct word
string correct(string word, map<string,int> known_words){
	vector<string> candidates;
 
	// Is the word known?
	candidates = known(word,known_words);
 
	// If not, get words from distance 1
	if(candidates.size() == 0)
		candidates = known(edits1(word),known_words);
 
	// If nothing from distance 1 is a word,
	// get words from distance 2
	if(candidates.size() == 0)
		candidates = known_edits2(word,known_words);
 
	// If nothing return word
	if(candidates.size() == 0)
		return word;
 
	int count = candidates.size();
	int max=0,temp=0;
	string candidate;
 
	// Get word with the highest usage from candidates
	for(int x=0;x<count;x++){
		temp = known_words[(candidates[x])];
		if(temp > max){
			max = temp;
			candidate = candidates[x];
		}
	}
 
	return candidate;
 
}
 
int main(int argc, char* argv[]){
 
	if(argc != 2){
		cerr << "Usage: " << argv[0] << "string" << endl;
		return -1;
	}
 
	// Get word from user input
	string word = argv[1];
 
	// File that has huge collection of words
	ifstream file("big.txt");
 
	// Create map of words
	map<string,int> known_words = get_words(file);
 
	// Get correct word
	string correction = correct(word, known_words);
 
	cout << "correct( " << word << " ) == " << correction << endl;
 
	return 0;
}

Related posts:


blog comments powered by Disqus