/*----------------------------------------------------------------------
    Character / Text line recognition library   nhrec.cpp
      Rev. 090510
        Written by H.Goto, Jan. 2009
        Revised by H.Goto, Jan. 2009
        Revised by H.Goto, May  2009
----------------------------------------------------------------------*/

/*--------------
  Copyright 2008,2009  Hideaki Goto

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
      http://www.apache.org/licenses/LICENSE-2.0
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
--------------*/


// Default number of recognition candidates; the NHrec constructor copies
// this into n_top.
// NOTE(review): defined ahead of the #includes, so the project headers may
// see it as well -- confirm before moving it.
#define		N_Top		10

#include        <stdio.h>
#include        <stdlib.h>
#include        <string.h>
#include        <math.h>

#include	"utypes.h"

#include	"ufilep.h"
#include	"siplib.h"
#include	"imgobj.h"
#include	"objgrp.h"

#include	"discrim.h"
#include	"otsubin.h"

#include	"nhocr.h"
#include	"feature_PLOVE.h"

#include	"segchar_adhoc.h"	// FIXME


// Default file/path names, installed by the NHrec constructor.
// NOTE(review): these bind string literals to non-const char*, which
// ISO C++11 no longer permits; nothing may ever write through them.
static char	*nhocrlibdir0 = "/opt/nhocr/share";
static char	*cctablefile0 = "cctable.utf-8";
static char	*dicfile0 = "PLM.dic";


// printf() that only fires when NHrec::debug is nonzero.
// Caution: this expands to a bare `if`, so an `else` written after a use of
// the macro binds to the macro's `if` rather than to the enclosing one.
#define	debugprintf	if(NHrec::debug)printf




/*----------------------------------------------
    Constructor / Destructor
----------------------------------------------*/

// Put the recognizer into its "nothing loaded" state: built-in default
// paths, no code list, N_Top candidates per character, debugging off.
NHrec :: NHrec(){
	// Where open() will look for its data files.
	this->nhocrlibdir = nhocrlibdir0;
	this->dicfile     = dicfile0;
	this->cctablefile = cctablefile0;

	// Nothing is loaded until open() succeeds.
	this->cclist = 0;
	this->n_cat  = 0;
	this->n_top  = N_Top;

	this->debug = 0;
}


// Release whatever open() loaded; close()'s status is not needed here.
NHrec :: ~NHrec(){
	(void)this->close();
}




/*----------------------------------------------
    Set library directory / dictionary file
----------------------------------------------*/

int NHrec :: setlibdir(char *path){
	nhocrlibdir = path;
	return(0);
}


int NHrec :: setdicfile(char *dicname, char *cctablename){
	dicfile = dicname;
	cctablefile = cctablename;
	return(0);
}




/*----------------------------------------------
    Automatic image inverter
----------------------------------------------*/

// Invert an image in place when at least half of its pixels are dark.
//
// Each row image->pdata[y] is read as `width` unsigned bytes.  Pixels with a
// value below 128 are counted; if they make up less than half of the image
// nothing is changed, otherwise every byte v is replaced by 0xff - v.
// Always returns 0.
//
// The ratio test is done without forming width*height in `int`, which would
// overflow for very large images; the pixel counter is size_t for the same
// reason.
int NHrec :: auto_invert(SIPImage *image){
	uchar	*p;
	int	x,y;
	size_t	darkcount = 0;
	const int	width  = image->width;
	const int	height = image->height;

	for ( y=0 ; y<height ; y++ ){
		p = (uchar *)image->pdata[y];
		for ( x=0 ; x<width ; x++ ){
			if ( p[x] < 128 )  darkcount++;
		}
	}

	// Same decision as  darkcount / (width*height) < .5 ; both sides are
	// exact in double precision for any realistic image size.
	if ( 2.0 * (double)darkcount < (double)width * (double)height ){
		return(0);
	}

	// invert image
	for ( y=0 ; y<height ; y++ ){
		p = (uchar *)image->pdata[y];
		for ( x=0 ; x<width ; x++ ){
			p[x] = 0xff - p[x];
		}
	}
	return(0);
}




/*----------------------------------------------
    Adaptive binarization
----------------------------------------------*/

// Member-function front end for adpt_binarize_otsu(): all four arguments are
// forwarded unchanged and its result is returned as is.
int NHrec :: binarize_otsu(SIPImage *src, SIPImage *dst,
	int csize, int interpolate){
	const int	status = adpt_binarize_otsu(src, dst, csize, interpolate);
	return status;
}




/*----------------------------------------------
    Recognize text line image
----------------------------------------------*/

// Load the character code table and the character dictionary.
//
// Both files are opened as "<nhocrlibdir>/<file name>".  The code table is
// loaded first (into cclist / n_cat), then the dictionary (into Rec); the
// two must describe the same number of characters.
//
// Returns 0 on success and 1 on any failure (a message goes to stderr).
// On failure the code list is released AND n_cat is reset to 0, so that
// rec_line() -- which checks n_cat before touching cclist -- refuses to run
// instead of dereferencing a null code list.
int NHrec :: open(){
	int	n;
	int	status = 1;
	char	*fname;
	size_t	fnamesize;

	// Room for the directory, '/', the longer of the two file names and
	// the terminating NUL.
	fnamesize = strlen(cctablefile);
	if ( fnamesize < strlen(dicfile) ){
		fnamesize = strlen(dicfile);
	}
	fnamesize += strlen(nhocrlibdir) + 2;
	if ( 0 == (fname = new char[fnamesize]) ){
		fprintf(stderr,"libnhocr: Failed to access to library directory %s.\n",nhocrlibdir);
		return(1);
	}

	strcpy(fname,nhocrlibdir);
	strcat(fname,"/");
	strcat(fname,cctablefile);
	n_cat = load_codelist(fname,&cclist);
	if ( n_cat < 1 ){
		fprintf(stderr,"libnhocr: Failed to load Character code table.\n");
		fprintf(stderr,"libnhocr: Check NHOCR_DICDIR environment variable.\n");
	}
	else{
#if 0
		CharCode	*cc;
		debugprintf("libnhocr: Found %d character codes.\n",n_cat);
		for ( n=0 ; n<n_cat ; n++ ){
			cc = &cclist[n];
			debugprintf("libnhocr: %d:\t%s\t%X\t\%04X\n", \
				n,cc->ccode,cc->poshint,cc->sizehint);
		}
#endif

		strcpy(fname,nhocrlibdir);
		strcat(fname,"/");
		strcat(fname,dicfile);
		n = ocrbase_loaddic(&Rec, fname, FVECDIM_PLM, n_top);
		if ( n<1 ){
			fprintf(stderr,"libnhocr: Failed to load character dictionary.\n");
			fprintf(stderr,"libnhocr: Check NHOCR_DICDIR environment variable.\n");
		}
		else if ( n_cat != n ){
			fprintf(stderr,"libnhocr: Mismatch in number of characters (%d vs %d).\n",n_cat,n);
		}
		else{
			status = 0;
		}
	}

	if ( status != 0 ){
		// Leave the object in a consistent "nothing loaded" state.
		if ( cclist ){ delete []cclist;  cclist = 0; }
		n_cat = 0;
	}

	delete []fname;
	return(status);
}




int NHrec :: close(){
	Rec.dealloc();
	if ( cclist ){ delete []cclist;  cclist = 0; }
	return(0);
}




// Append the NUL-terminated string `str` to `line`, which lives in a buffer
// of `bufsize` bytes.
//
// Returns the new length of `line`, or -1 (leaving `line` untouched) when
// the result plus its terminating NUL would not fit.  A non-positive
// bufsize is rejected explicitly: converted to size_t a negative value would
// become enormous and silently disable the bounds check.
int NHrec :: rec_addstr(char *line, char *str, int bufsize){
	if ( bufsize <= 0 )  return(-1);

	const size_t	used  = strlen(line);
	const size_t	extra = strlen(str);
	if ( used + extra >= (size_t)bufsize )  return(-1);

	// Copy the terminating NUL along with the characters.
	memcpy(line + used, str, extra + 1);
	return( (int)(used + extra) );
}




// Recognize the character inside box `cb` of `image` and pick the best
// candidate that agrees with the box's hints.
//
// The box area is copied out, its pixel values are bit-inverted, the result
// is normalized into a 64x64 work image, a PLM feature vector is extracted
// and Rec.recognizeEuclidean() fills Rec.resultTable.  The table is then
// scanned in order:
//   - scanning stops at the first entry whose id is outside [0, n_cat);
//   - if cb->ascii is set, candidates not flagged ascii are skipped;
//   - the first remaining candidate is kept as a fallback;
//   - the first candidate whose poshint overlaps cb->poshint and whose
//     sizehint is SizeHint_None or overlaps cb->sizehint wins.
//
// Returns an INDEX INTO Rec.resultTable (not a character id),
//   -1 if no candidate qualified or the 64x64 work image could not be made,
//   -2 if the image for the box area could not be made.
int NHrec :: rec_character(SIPImage *image, CharBox *cb){
	uchar	*p;
	int	cbw,cbh;
	int	n=0, cid, id;
	OCRPrep	prep;
	FeatureVector	vec(FVECDIM_PLM);
	SIPImage	*cimage;
	SIPImage	*cnorm;
	SIPRectangle	srect,drect;
	drect.x = drect.y = 0;

	if ( 0 == (cnorm = sip_CreateImage(64,64,8)) )  return(-1);

	cbw = cb->width();
	cbh = cb->height();
	srect.x = cb->xs;
	srect.y = cb->ys;
	srect.width = drect.width = cbw;
	srect.height = drect.height = cbh;
	cimage = sip_CreateImage(cbw,cbh,8);
	if ( cimage == 0 ){
		// Do not leak the work image on this error path.
		sip_DestroyImage(cnorm);
		return(-2);
	}
	sip_CopyArea(image,&srect,cimage,&drect);
	for ( int y=0 ; y<cbh ; y++ ){
		p = (uchar *)cimage->pdata[y];
		for ( int x=0 ; x<cbw ; x++ ){
			p[x] = ~p[x];
		}
	}
	prep.normalize(cimage,cnorm,2.0);
	feature_PLM(cnorm, &vec);
	Rec.recognizeEuclidean(vec,0);
	sip_DestroyImage(cimage);
	sip_DestroyImage(cnorm);

#if 0
	printf("--------\n");
	for ( cid = -1, n=0 ; n<n_top ; n++ ){
		if ( Rec.resultTable[n].id >= n_cat \
		  || Rec.resultTable[n].id < 0 ){
			break;
		}
		printf("%s\t%04x %04x %04x %04x\n", \
			cclist[Rec.resultTable[n].id].ccode,
			cclist[Rec.resultTable[n].id].poshint,cb->poshint,\
			cclist[Rec.resultTable[n].id].sizehint,cb->sizehint);
	}
#endif

	for ( cid = -1, n=0 ; n<n_top ; n++ ){
		id = Rec.resultTable[n].id;
		if ( id >= n_cat || id < 0 )  break;

		// An ASCII-only box accepts ASCII candidates only.
		if ( cb->ascii && ! cclist[id].ascii )  continue;

		// Remember the first acceptable candidate in case none of
		// them satisfies the position/size hints below.
		if ( cid < 0 )  cid = n;

		if ( (cclist[id].poshint & cb->poshint) == 0 )  continue;

		if ( cclist[id].sizehint == SizeHint_None \
		  || (cclist[id].sizehint & cb->sizehint) != 0 ){
			cid = n;
			break;
		}
	}

	return(cid);
}




// Recognize the character inside the rectangle (x0, y0, width, height) of
// `image` and hand back the raw candidate list.
//
// The area is copied out, its pixel values are bit-inverted, the result is
// normalized into a 64x64 work image, a PLM feature vector is extracted and
// Rec.recognizeEuclidean() fills Rec.resultTable.  The first n_top entries
// of that table are copied to `resultTable`, which must have room for them.
// No hint-based filtering is done here.
//
// Returns the id of the top candidate (per the original note, -1 when there
// is no matching character).  Also returns -1 if the 64x64 work image could
// not be made and -2 if the image for the area could not be made; in both
// of those cases `resultTable` is left untouched.
int NHrec :: rec_character(SIPImage *image, \
		int x0, int y0, int width, int height, \
		RecResultItem *resultTable){
	uchar	*p;
	OCRPrep	prep;
	FeatureVector	vec(FVECDIM_PLM);
	SIPImage	*cimage;
	SIPImage	*cnorm;
	SIPRectangle	srect,drect;
	drect.x = drect.y = 0;

	if ( 0 == (cnorm = sip_CreateImage(64,64,8)) )  return(-1);

	srect.x = x0;
	srect.y = y0;
	srect.width = drect.width = width;
	srect.height = drect.height = height;
	cimage = sip_CreateImage(width,height,8);
	if ( cimage == 0 ){
		// Do not leak the work image on this error path.
		sip_DestroyImage(cnorm);
		return(-2);
	}
	sip_CopyArea(image,&srect,cimage,&drect);
	for ( int y=0 ; y<height ; y++ ){
		p = (uchar *)cimage->pdata[y];
		for ( int x=0 ; x<width ; x++ ){
			p[x] = ~p[x];
		}
	}
	prep.normalize(cimage,cnorm,2.0);
	feature_PLM(cnorm, &vec);
	Rec.recognizeEuclidean(vec,0);
	sip_DestroyImage(cimage);
	sip_DestroyImage(cnorm);

	memcpy((void *)resultTable, (void *)(Rec.resultTable), \
		sizeof(RecResultItem) * n_top);

	// Return -1 if there is no matching character.
	return( Rec.resultTable[0].id );
}




// Recognize one text-line image and write the text to `resultline`.
//
// segmentchars() splits the line into character boxes.  cba[] holds the
// boxes to recognize; a box with nbox == 2 also has two "raw" sub-boxes at
// the same index (and index+1) in cba_raw[].  Such a box is recognized both
// as a whole and as two separate characters, and the reading with the
// clearly smaller distance is emitted.  A space is inserted where the gap
// to the previous box is wide, and "." stands in for very long objects and
// for boxes that could not be recognized.
//
// Output stops silently once `resultline` (capacity `bufsize` bytes,
// terminating NUL included) is full.
//
// Returns -1 if the arguments are unusable, no dictionary is loaded or the
// box array cannot be allocated; otherwise 0, including when the output was
// cut short.
int NHrec :: rec_line(SIPImage *image, char *resultline, int bufsize){
	// Distance value meaning "this reading is not available".
	const double	no_match = 1.0e30;
	int	cid;
	CharBox	*cba = 0;
	CharBox	*cba_raw;
	CharBox	*cb;
	double	avrcwidth, lineheight, charpitch;
	int	xe0;
	int	i;
	RecResultItem	recResult_c;	// combined
	RecResultItem	recResult1;
	RecResultItem	recResult2;

	// Validate before the first write into the caller's buffer.
	if ( resultline == 0 || bufsize < 1 )  return(-1);
	resultline[0] = '\0';

	if ( image == 0 )  return(-1);
	if ( n_cat < 1 || cclist == 0 )  return(-1);

	// One allocation, two halves: cba[] and cba_raw[].
	if ( 0 == (cba = new CharBox[ 2 * (image->width +1) ]) )  return(-1);
	cba_raw = cba + image->width +1;

	segmentchars(image, cba, cba_raw, \
		&avrcwidth, &lineheight, &charpitch, NHrec::debug);

#if 0
	for ( i=0 ; cba[i].nbox != 0 ; i+=cba[i].nbox ){
		cb = &cba[i];
		if ( cb->poshint & PosHint_Top ){
			printf("T");
		} else if ( cb->poshint & PosHint_Middle ){
			printf("M");
		} else if ( cb->poshint & PosHint_Bottom ){
			printf("B");
		}
	}
	printf("\n");
#endif

	xe0 = cba[0].xe;
	for ( i=0 ; cba[i].nbox != 0 ; i+=cba[i].nbox ){
		cb = &cba[i];

		// The box list has no explicit SPACE character.  Spaces are
		// detected and inserted by measuring the distance between
		// adjoining boxes.
		if ( (int)avrcwidth * .8 < cb->xs - xe0 ){
			if ( 0 > rec_addstr(resultline, " ", bufsize) )  break;
		}
		xe0 = cb->xe;

		// Discard very long objects that do not look like text.
		if ( cb->width() / lineheight > 3 ){
			if ( 0 > rec_addstr(resultline, ".", bufsize) )  break;
			continue;
		}

		// (suppress compiler warning)
		recResult_c = recResult1 = recResult2 = Rec.resultTable[0];

		// Reading 1: the box as two separate characters.  Both halves
		// must be recognized; the score is their mean distance.
		// Every rec_character() call overwrites Rec.resultTable, so
		// each result is copied out immediately.
		bool	split_ok = ( cb->nbox == 2 );
		double	dist_split = 0;
		if ( split_ok ){
			cid = rec_character(image, &cba_raw[i]);
			if ( cid >= 0 ){
				recResult1 = Rec.resultTable[cid];
				dist_split += recResult1.dist;
			}
			else{ split_ok = false; }

			cid = rec_character(image, &cba_raw[i+1]);
			if ( cid >= 0 ){
				recResult2 = Rec.resultTable[cid];
				dist_split += recResult2.dist;
			}
			else{ split_ok = false; }

			dist_split /= 2;
		}
		if ( ! split_ok )  dist_split = no_match;

		// Reading 2: the box as one character.
		cid = rec_character(image, cb);
		if ( cid == -2 )  break;	// could not allocate the box image
		double	dist_whole = no_match;
		if ( cid >= 0 ){
			recResult_c = Rec.resultTable[cid];
			dist_whole = recResult_c.dist;
		}

		if ( dist_whole == no_match && dist_split == no_match ){
			// Neither reading worked.
			if ( 0 > rec_addstr(resultline, ".", bufsize) )  break;
		}
		else if ( dist_whole > 1.3 * dist_split ){
			// The split reading wins only by a clear margin.
			if ( 0 > rec_addstr(resultline, \
				cclist[recResult1.id].ccode, bufsize) ){
				break;
			}
			if ( 0 > rec_addstr(resultline, \
				cclist[recResult2.id].ccode, bufsize) ){
				break;
			}
		}
		else{
			if ( 0 > rec_addstr(resultline, \
				cclist[recResult_c.id].ccode, bufsize) ){
				break;
			}
		}
	}

	delete []cba;
	return(0);
}




