Software‎ > ‎

Optical Character Recognition

Computers can receive information through a myriad of input devices, for example via a keyboard, mouse, camera, or microphone. It is common for a computer to do no processing on the input it receives and in case of storage, to store all of the bits from the input devices verbatim. Though this has its benefits, sometimes it is necessary to have a more fundamental understanding of the input data. One example of this is to have some text inputted through a camera or a scanner in the form of an image, where it may be useful to have a textual representation as well.

The process by which one can do this is known as optical character recognition, or more commonly OCR. In this project, I am going to describe how implement one of these.

First let us take a look at how we could generate a simple database from the various samples we may have initially. Either we would need to write all the rules for what an ‘A’ could look like, or we could simply show the system what ‘A’s look like; the latter is much simpler to do, and it is a prerequisite, to have a set of all possible letters visually represented. In this case I will assume that the first letter of the filename represents the character, and the extension would be .bmp. For example, A123.bmp would be one image of an ‘A’. 

We first will take a bitmap of a character, with arbitrary whitespace around it, and generate a 20x20 array.
int** segment_aux_generate_20x20(BMP * im1)
{
      RGBApixel black_pix;
      black_pix.Alpha = 0, black_pix.Red = 0, black_pix.Green = 0, black_pix.Blue = 0;

      RGBApixel white_pix;
      white_pix.Alpha = 0, white_pix.Red = 255, white_pix.Green = 255, white_pix.Blue = 255;

      int width = (*im1).TellWidth();
      int height = (*im1).TellHeight();
     
      // Copy the image to a temporary array for easy processing.
      int** seg1;
      seg1 = new int*[height];
      for (int i = 0; i < height; i++) {
            seg1[i] = new int[width];
      }

      for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                  if ( is_color_similar ( (*(im1))(j, i), &white_pix, 1) ) {
                        seg1[i][j] = 0;
                  }
                  else {
                        seg1[i][j] = 1;
                  }
            }
      }
      // Here, we detect the boundaries of the letter.
      //    y_i
      //    -----
      //x_i|   |x_f
      //    -----
      //    y_f

      int x_i, y_i, x_f, y_f;
      x_i =  y_i = x_f = y_f = -1;
      for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                  if (seg1[i][j] == 1) {
                        y_i = i;
                        i = height; j = width;
                  }
            }
      }
     
      for (int i = height-1; i > 0; i--) {
            for (int j = 0; j < width; j++) {
                  if (seg1[i][j] == 1) {
                        y_f = i;
                        i = 0; j = width;
                  }
            }
      }
     
      for (int i = 0; i < width; i++) {
            for (int j = 0; j < height; j++) {
                  if (seg1[j][i] == 1) {
                        x_i = i;
                        i = width; j = height;
                  }
            }
      }
     
      for (int i = width-1; i > 0; i--) {
            for (int j = 0; j < height; j++) {
                  if (seg1[j][i] == 1) {
                        x_f = i;
                        i = 0; j = height;
                  }
            }
      }

      if (x_i == -1 || y_i == -1 || x_f == -1 || y_f == -1) {
            for (int i = 0; i < height; i++) {
                  delete[] seg1[i];
            }

            return NULL;
      }
      // The resizing happens here.
      int** _data;
      _data = new int*[x_f - x_i + 1];
      for (int i = 0; i < x_f - x_i + 1; i++) {
            _data[i] = new int[y_f - y_i + 1];
      }
      for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                  if (j >= x_i && j <= x_f) {
                        if (i >= y_i && i <= y_f) {
                              if (seg1[i][j] == 1) {
                                    _data[j - x_i][i - y_i] = 1;
                              }
                              else {
                                    _data[j - x_i][i - y_i] = 0;
                              }
                        }
                  }
            }
      }

      int** segment_1_arr;
      segment_1_arr = resample(_data, x_f - x_i + 1, y_f - y_i + 1);

      // Clear the seg1 array
      for (int i = 0; i < height; i++) {
            delete[] seg1[i];
      }
      for (int i = 0; i < x_f - x_i + 1; i++) {
            delete[] _data[i];
      }

      return segment_1_arr;

}

The sub algorithm to resample is given below:
int** resample(int** _data, int _width, int _height)
{
      // The new dimensions are 20x20.
      int final_x = 20, final_y = 20;

      if(_data == NULL)
            return NULL;

      // The new buffer
      int** buffer;
      buffer = new int*[final_y];
      for (int i = 0; i < final_y; i++) {
            buffer[i] = new int[final_x];
      }
   
      // Get the ratio of final dimension to original dimension
      double scale_x =  (double)final_x / (double)_width;
      double scale_y = (double)final_y / (double)_height;

      for(int cy = 0; cy < final_y; cy++) {
            for(int cx = 0; cx < final_x; cx++) {
                  int nearestMatch_y = (int)(cy / scale_y);
                  int nearestMatch_x = (int)(cx / scale_x);

                  buffer[cy][cx] =  _data[nearestMatch_x][nearestMatch_y];
            }
      }
     
    return buffer;
}


So now that we have an internal 20x20 array representation of a particular bitmap of a letter, we could write the data into our organized database file.
bool global_load_database() {

      struct _finddata_t file_info;
     
      // Get the current working directory, we will look for *.bmp in a subfolder \db\.
      char dir[256];
      getcwd( dir, 256 );
     
      char db[256] = "\\db\\";
      char old_path[256] = "";
      char temp_file[256];

      intptr_t handle = 0;

      memset(&file_info,0,sizeof(file_info));

      strcpy(old_path,dir);
      strcat(old_path,db);
      strcat(old_path,"*.bmp");

      // Count the number of .bmp files into GLOBAL_TEXT_DATABASE_COUNTER
      handle = _findfirst(old_path,&file_info);
      if (handle != -1)
      {
        do
        {
                  GLOBAL_TEXT_DATABASE_COUNTER++;                
            }          
            while(_findnext(handle,&file_info) != -1);           
      }
    _findclose(handle);

      if (GLOBAL_TEXT_DATABASE_COUNTER == 0) {
            return false;
      }

      // Now initialize the actual representation of the characters and corresponding characters
      GLOBAL_TEXT_DATABASE = new int**[GLOBAL_TEXT_DATABASE_COUNTER];
      GLOBAL_TEXT_DATABASE_LETTERS = new char[GLOBAL_TEXT_DATABASE_COUNTER];

      if(GLOBAL_TEXT_DATABASE == NULL || GLOBAL_TEXT_DATABASE_LETTERS == NULL) {
            return false;
      }

      unsigned long num_files_counter = 0;
      handle = _findfirst(old_path,&file_info);
      if (handle != -1)
      {
        do
        {
                  // This will be the full directory and name of the current bitmap.
                  char *new_name = NULL;
                  new_name = strdup(file_info.name);

                  for(int i = 0; i < 256; i++) {
                        temp_file[i] = '\0';
                  }

                  strcat(temp_file, dir);
                  strcat(temp_file, db);
                  strcat(temp_file, new_name);

                  // Load the bitmap.
                  BMP * im1;
                  im1 = new BMP;
                  (*im1).ReadFromFile(temp_file);

                  // Generate the data using the
auxiliary functions and add to the database.
                  GLOBAL_TEXT_DATABASE[num_files_counter] = segment_aux_generate_20x20(im1);
                  GLOBAL_TEXT_DATABASE_LETTERS[num_files_counter] = new_name[0];

                  num_files_counter++;

            delete new_name;
                  free(temp_file);
                  (*im1).~BMP();
            }

            while(_findnext(handle,&file_info) != -1);
      }

      // Write it from memory into storage.
      if(global_write_to_dat())
            return true;

      return false;
}


In my implementation I have just a simple binary file which stores relevant information for the database. The binary file is separated into three parts, the first 32 bits represent an unsigned long which is the number of entries we have. The next n * 8 bits each represent a character, where n is the numerical value of the unsigned long we read earlier. The next n * 32 * 20 * 20 bits represent the actual data for each letter; N is the number of letters, and each letter is stored in a 20x20 grid, and each element of this grid is a 32 bit int.
bool global_write_to_dat()
{
      struct _finddata_t file_info;

      // Get the current working directory. The database, just a binary db.dat file, will be stored in
      // a subdirectory \db\.
      char dir[256];
      getcwd( dir, 256 );

      char db[256] = "\\db\\";
      char old_path[256] = "";
      char temp_file[256];

      intptr_t handle = 0;

      memset(&file_info,0,sizeof(file_info));

      strcpy(old_path,dir);
      strcat(old_path,db);
      strcat(old_path,"db.dat");

      // Remove old db if it exists.
      remove(old_path);

      // Create a new db
      fstream indb(old_path, ios::out | ios::binary);

      // The dimensions for the OCR character data is 20x20.
      int array_dim = 20;

      // Write the amount of elements in the database array
      indb.write(reinterpret_cast<char*> (&GLOBAL_TEXT_DATABASE_COUNTER), sizeof (unsigned long) );

      // Write the character index for the database array
      indb.write( (GLOBAL_TEXT_DATABASE_LETTERS), GLOBAL_TEXT_DATABASE_COUNTER * sizeof (char) );

      // Write the database array
      for(unsigned long i = 0; i < GLOBAL_TEXT_DATABASE_COUNTER; i++) {
            for (int j = 0; j < array_dim; j++) {
                  indb.write( reinterpret_cast<char*> (GLOBAL_TEXT_DATABASE[i][j]), array_dim * sizeof(int) );
            }
      }

      indb.close();
      free(old_path);

      // You can include some additional error checking here if you want.
      return true;

}


The corresponding reading section is given:
bool global_read_from_dat()
{
      struct _finddata_t file_info;
     
      char dir[256];
      getcwd( dir, 256 );
     
      char db[256] = "\\db\\";
      char old_path[256] = "";
      char temp_file[256];

      intptr_t handle = 0;

      memset(&file_info,0,sizeof(file_info));

      strcpy(old_path,dir);
      strcat(old_path,db);
      strcat(old_path,"db.dat");

      // Is the db found?
      handle = _findfirst(old_path,&file_info);
      if (handle == -1)
      {
            _findclose(handle);
        return false;
      }
     
      fstream indb(old_path, ios::in | ios::binary);

      int array_dim = 20;

      // Load the size of the database array
      indb.read(reinterpret_cast<char*> (&GLOBAL_TEXT_DATABASE_COUNTER), sizeof (unsigned long) );

      // Create the character index array, and load it
      GLOBAL_TEXT_DATABASE_LETTERS = new char[GLOBAL_TEXT_DATABASE_COUNTER];
      indb.read( (GLOBAL_TEXT_DATABASE_LETTERS), GLOBAL_TEXT_DATABASE_COUNTER * sizeof (char) );
     
      // Create the database array, and load it
      GLOBAL_TEXT_DATABASE = new int**[GLOBAL_TEXT_DATABASE_COUNTER];
      for(unsigned long i = 0; i < GLOBAL_TEXT_DATABASE_COUNTER; i++) {
            GLOBAL_TEXT_DATABASE[i] = new int*[array_dim];
            for (int j = 0; j < array_dim; j++) {
                  GLOBAL_TEXT_DATABASE[i][j] = new int[array_dim];
                  indb.read(reinterpret_cast<char*> (GLOBAL_TEXT_DATABASE[i][j]), array_dim * sizeof(int) );
            }
      }

      indb.close();
      free(old_path);

      return true;
}

Here is a simple matching algorithm. Essentially we just match the current input, which is a 20x20 array of character data, with all the letters of the database. The running time is proportional to the size of the database.
// A struct to keep track of possible matches and their probability.
struct letter_probability {
      char letter;
      int probability;
};

// Custom
comparator to sort by highest probability.
bool letter_probability_sort(letter_probability i, letter_probability j)
{
      if (i.probability > j.probability )
            return true;

      return false;
}

char* segment_aux_match(int** letter_20x20)
{
      vector<letter_probability> letter_prob_vector;

      char* result;
      result = new char[2];

      for (unsigned long k = 0; k < GLOBAL_TEXT_DATABASE_COUNTER; k++) {

            int match_counter = 0;

            for (int i = 0; i < 20; i++) {
                  for (int j = 0; j < 20; j++) {
                        if (GLOBAL_TEXT_DATABASE[k][i][j] == letter_20x20[i][j]) {
                              match_counter++;
                        }
                  }
            }

            letter_probability a;
            a.letter = GLOBAL_TEXT_DATABASE_LETTERS[k];
            a.probability = match_counter;

            letter_prob_vector.push_back(a);
     
      }

      sort(letter_prob_vector.begin(), letter_prob_vector.end(), letter_probability_sort);

      // If you need to output information about the match, here is the letter and the matching
probability.
      // Sample output: cout << endl << endl << result[0] << " " << int(result[1]) << "%" << endl << endl;
      result[0] = (*(letter_prob_vector.begin())).letter;
      result[1] = int ( double ( 100* (*(letter_prob_vector.begin())).probability) / double((20*20)) );

      return result;

}


Conclusion: This implementation is just a basic framework of how an OCR system could work. You could improve and optimize the algorithms to make it faster and more accurate. For example, to increase speed it’s not necessary to match every character in the database; keep track of how many misses within each character data you could have, and do not process further if it is exceeded. To improve accuracy you could perform a type of normalization on the inputs before matching. However, this system works well enough for many purposes.

Original ideas by Prabhu, published 2011 at p.rabhu.com.