Detecting page orientation (C++)

I have a bunch of images that I would like to rotate based on page orientation, so I searched for possible solutions within tesseract & leptonica...

For testing, I used the image lyra.005.jpg available in the leptonica repository.

API Example

I started with the Orientation and script detection (OSD) example provided by the tesseract documentation and added simple speed measurement:

/*
  tessAPI.cpp
MSVC build:
  cl /EHsc tessAPI.cpp /I C:/win64/include /link /LIBPATH:C:\win64\lib leptonica-1.84.0.lib tesseract53.lib
*/

#include <chrono>

#include <leptonica/allheaders.h>
#include <tesseract/baseapi.h>

void test_osd(tesseract::Orientation *orientation,
              tesseract::WritingDirection *writing_direction,
              tesseract::TextlineOrder *textline_order, float *deskew_angle,
              const char *lang) {

  const char *inputfile = "lyra.005.jpg";
  PIX *image = pixRead(inputfile);
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();

  api->Init(NULL, lang);
  api->SetVariable("debug_file", "/dev/null");
  api->SetPageSegMode(tesseract::PSM_AUTO_OSD);
  api->SetImage(image);
  api->Recognize(0);
  tesseract::PageIterator *it = api->AnalyseLayout();
  it->Orientation(orientation, writing_direction, textline_order, deskew_angle);
  delete it;   // iterator and API object are owned by the caller
  api->End();
  delete api;
  pixDestroy(&image);
}

int main(int argc, char *argv[]) {
  auto repeats = 20;
  auto lang = "eng";
  tesseract::Orientation orientation;
  tesseract::WritingDirection direction;
  tesseract::TextlineOrder order;
  float deskew_angle;

  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < repeats; ++i) {
    test_osd(&orientation, &direction, &order, &deskew_angle, lang);
  }
  auto t2 = std::chrono::high_resolution_clock::now();
  printf(
      "\ntest_osd (%s) function took on average %lli milliseconds (%d "
      "repeats)\n",
      lang,
      std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() /
          repeats,
      repeats);
  printf("Orientation: %d;\nWritingDirection: %d\nTextlineOrder: %d\n"
         "Deskew angle: %.4f\n",
         orientation, direction, order, deskew_angle);
  return 0;
}

Remark: You have to use traineddata that contains the tesseract legacy model (e.g. from https://github.com/tesseract-ocr/tessdata)!
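
One way to check this up front (a minimal sketch, separate from the timing tests) is to force the legacy engine at initialization; with an LSTM-only traineddata the Init call should then fail:

/*
  checkLegacy.cpp -- sketch: verify that the traineddata contains the legacy
  model by forcing the legacy engine; Init should fail for LSTM-only data.
*/

#include <cstdio>

#include <tesseract/baseapi.h>

int main() {
  tesseract::TessBaseAPI api;
  if (api.Init(NULL, "eng", tesseract::OEM_TESSERACT_ONLY) != 0) {
    printf("eng.traineddata has no legacy model; PSM_AUTO_OSD will not work\n");
    return 1;
  }
  printf("legacy model present\n");
  api.End();
  return 0;
}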

Here is my result for eng traineddata:

test_osd (eng) function took on average 987 milliseconds (20 repeats)
Orientation: 3;
WritingDirection: 0
TextlineOrder: 2
Deskew angle: 0.0000

Here is the result for osd traineddata:

test_osd (osd) function took on average 1508 milliseconds (20 repeats)
Orientation: 3;
WritingDirection: 0
TextlineOrder: 2
Deskew angle: 0.0000

The results are quite surprising to me, as I would expect the osd model to be faster (it is meant to be used only for orientation and script detection). Both models agree on the result, by the way: Orientation 3 corresponds to ORIENTATION_PAGE_LEFT in tesseract's publictypes.h. According to the documentation, AnalyseLayout() can be run without recognition, and since I am only interested in orientation detection, the Recognize(0) call can be skipped. Commenting out the respective line in the code gives a significant speed improvement; the change is shown in the sketch below.
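
The only change in test_osd is commenting out the Recognize(0) call:

  api->SetImage(image);
  // api->Recognize(0);  // skip full recognition; layout analysis alone is enough for OSD
  tesseract::PageIterator *it = api->AnalyseLayout();
  it->Orientation(orientation, writing_direction, textline_order, deskew_angle);

With this change, the results are: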

test_osd (eng) function took on average 433 milliseconds (20 repeats)
Orientation: 3;
WritingDirection: 0
TextlineOrder: 2
Deskew angle: 0.0000
test_osd (osd) function took on average 428 milliseconds (20 repeats)
Orientation: 3;
WritingDirection: 0
TextlineOrder: 2
Deskew angle: 0.0000

In this case the difference between eng and osd is not significant (and osd is slightly faster).

DetectOrientationScript

There is also another way to get information about the orientation: DetectOrientationScript.

/*
  tessDOS.cpp
MSVC build:
  cl /EHsc tessDOS.cpp /I C:/win64/include /link /LIBPATH:C:\win64\lib leptonica-1.84.0.lib tesseract53.lib
*/

#include <chrono>

#include <leptonica/allheaders.h>
#include <tesseract/baseapi.h>

void test_dos(int *orient_deg, float *orient_conf, const char **script_name,
              float *script_conf, const char *lang) {

  const char *inputfile = "lyra.005.jpg";
  PIX *image = pixRead(inputfile);
  tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();

  api->Init(NULL, lang);
  api->SetVariable("debug_file", "/dev/null");
  api->SetImage(image);
  api->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf);
  api->End();
  pixDestroy(&image);
}

int main(int argc, char *argv[]) {
  auto repeats = 20;
  auto lang = "eng";
  int orient_deg;
  float orient_conf, script_conf;
  const char *script_name;

  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < repeats; ++i) {
    test_dos(&orient_deg, &orient_conf, &script_name, &script_conf, lang);
  }
  auto t2 = std::chrono::high_resolution_clock::now();
  printf(
      "\ntest_dos (%s) function tooks in average %lli milliseconds (%d "
      "repeats)\n",
      lang,
      std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() /
          repeats,
      repeats);
  printf("degree: %d  confidence: %f\n", orient_deg, orient_conf);
  printf("script: %s  confidence: %f\n", script_name, script_conf);
  return 0;
}

Here is my result for eng traineddata:

test_dos (eng) function took on average 248 milliseconds (20 repeats)
degree: 270  confidence: 5.029739
script: Latin  confidence: 28.148153

Using the osd traineddata seems to be even faster:

test_dos (osd) function took on average 211 milliseconds (20 repeats)
degree: 270  confidence: 2.614990
script: Cyrillic  confidence: 0.952381

This is about 100% faster than using AnalyseLayout() without recognition. Is it possible to make it even faster? Let's look at leptonica.

pixOrientDetect

pixOrientDetect requires a thresholded, deskewed (black-and-white) image with English text at 150-300 ppi. Here is the example code:

/*
  testLept.cpp
MSVC build:
  cl /EHsc testLept.cpp /I C:/win64/include /link /LIBPATH:C:\win64\lib leptonica-1.84.0.lib
*/

#include <chrono>
#include <cmath>

#include <leptonica/allheaders.h>

void test_lept(int *orientation) {

  l_int32 thresh = 90;
  l_float32 upconf1, leftconf1;
  const char *inputfile = "lyra.005.jpg";
  PIX *fpixs, *image = pixRead(inputfile);

  fpixs = pixConvertTo1(image, thresh);
  pixOrientDetect(fpixs, &upconf1, &leftconf1, 0, 0);

  // The sign and relative magnitude of the two confidences select one of the
  // four orthogonal orientations.
  if ((upconf1 > 1) && std::fabs(upconf1) > std::fabs(leftconf1))
    *orientation = 0;
  if ((leftconf1 > 1) && std::fabs(leftconf1) > std::fabs(upconf1))
    *orientation = 90;
  if ((upconf1 < -1) && std::fabs(upconf1) > std::fabs(leftconf1))
    *orientation = 180;
  if ((leftconf1 < -1) && std::fabs(leftconf1) > std::fabs(upconf1))
    *orientation = 270;

  pixDestroy(&fpixs);
  pixDestroy(&image);
}

int main(int argc, char *argv[]) {
  auto repeats = 20;
  int orientation = 0;
  
  setMsgSeverity(0);
  auto t1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < repeats; ++i) {
    test_lept(&orientation);
  }
  auto t2 = std::chrono::high_resolution_clock::now();
  printf(
      "\ntest_lept function took in average %lli milliseconds (%d "
      "repeats)\n",
      std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() /
          repeats,
      repeats);
  printf("Orientation: %d\n", orientation);
  return 0;
}

Here is my result:

test_lept function took on average 9 milliseconds (20 repeats)
Orientation: 90

The speed is impressive! But the requirements (English text, deskewed binarized image) could be a limitation in some cases...
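
Finally, since the goal is to actually rotate the images, here is a minimal sketch of applying a detected orthogonal angle with leptonica's pixRotateOrth. The file names and the quads value are illustrative, and how the detected degrees map to quads depends on the detector's convention (tesseract reported 270 while leptonica reported 90 for the same image), so verify on a sample first:

/*
  rotate.cpp -- sketch: rotate an image by a detected orthogonal angle.
MSVC build:
  cl /EHsc rotate.cpp /I C:/win64/include /link /LIBPATH:C:\win64\lib leptonica-1.84.0.lib
*/

#include <leptonica/allheaders.h>

int main() {
  PIX *image = pixRead("lyra.005.jpg");
  if (!image)
    return 1;
  // pixRotateOrth rotates clockwise by quads * 90 degrees (quads = 0..3).
  l_int32 quads = 1;
  PIX *rotated = pixRotateOrth(image, quads);
  pixWrite("lyra.005.rotated.jpg", rotated, IFF_JFIF_JPEG);
  pixDestroy(&image);
  pixDestroy(&rotated);
  return 0;
}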
