017-01.手書き文字-日本語の1行の文字列の認識

このサイトの紹介と使い方


created: 2021/11/04 modified: 2021/12/09

概要

  1. 1行の手書きの日本語の文字列をテキストに変換します。
  2. 文字の重なりは対象外とします。
  3. 1行は任意の矩形領域でアノテーションされているものとします。

1行の文字列の認識プログラムソース

失敗:015-kana.py

  1. ターゲット文字列の左端から、学習した文字画像の横幅を最小幅として領域を少しずつ広げながら認識させると、途中で違う文字として認識してしまいました。
  2. 問題点の予想は次の通りです。
    1. 学習量が少な過ぎる。
    2. モデルの構築が適切ではない。
import os,glob,sys
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import model_from_json

import c006_env as env

def main():
    """Recognise one handwritten line by scanning it with a growing window.

    The Keras model described by ``env.fs_json`` / ``env.fs_hdf5`` is loaded,
    ``env.target_file`` is converted to ``env.img_mode`` and scaled to a height
    of 60 pixels (aspect ratio kept).  The line is then scanned from the left:
    a window starts ``env.size_start`` pixels wide and is widened one pixel at
    a time up to ``env.size_end``.  The first window whose best score exceeds
    ``env.better_eval`` is printed as a character and the scan resumes just to
    the right of it.  When no width is accepted, the window origin moves right
    by ``env.size_start`` and the search starts again.
    """
    # Close the model description file deterministically.
    with open( env.fs_json,'r' ) as fp:
        json_string = fp.read()
    model = model_from_json(json_string)
    model.load_weights( env.fs_hdf5 )

    img = Image.open( env.target_file )
    img = img.convert( env.img_mode )

    # Width after scaling to height 60.  Kept as a float because it is only
    # used as the right-hand bound of the scan; the resize uses int(isx).
    isx = img.size[0] * 60 / img.size[1]
    img = img.resize( (int(isx),60) )

    leftx    = 1
    rightx   = env.size_start
    charflag = -1   # -1: nothing tried yet, 0: last window rejected, 1: accepted

    while rightx < isx:
        print('start')
        if( charflag == 0 ):
            # Nothing was accepted at the previous origin: shift the window.
            leftx  = leftx + env.size_start
            rightx = leftx + env.size_start
            if( rightx>isx ):
                break
        # The widening limit has to be derived from the *current* origin.
        # Computing it before the shift above made rightx == maxright after
        # the first rejected window, so no later window was ever classified.
        maxright = leftx+env.size_end
        print( leftx,rightx,maxright,isx )
        while rightx < maxright:
            if( rightx>isx ):
                break
            img_crop = img.crop( (leftx,1,rightx,60) )
            img_crop = img_crop.resize( (60,60) )
            # Cast from the uint8 pixel array instead of a Python list: the
            # values wrap to signed bytes exactly as the list conversion did
            # on older NumPy, without the OverflowError newer NumPy raises
            # for out-of-range Python integers.
            # NOTE(review): presumably the model was trained on arrays with
            # this signed-byte encoding -- confirm before moving to uint8.
            data = np.asarray( img_crop ).astype( 'int8' )[np.newaxis]
            data = data.astype( 'float32') /255
            pre = model.predict( data )
            idx2,score = env.best_char(pre[0])
            if( score>env.better_eval ):
                print( 'result:',env.groups[idx2],idx2,score )
                # Continue immediately to the right of the accepted window.
                leftx = rightx+1
                rightx = leftx + env.size_start
                charflag = 1
                break
            charflag = 0
            rightx += 1

    print( 'success' )


# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
import sys,glob
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import model_from_json

# Command-line arguments: class id selecting the character set, file-name
# prefix of the saved model, and base name (no extension) of the target image.
cls         = sys.argv[1]
model_name  = sys.argv[2]
target_f    = sys.argv[3]

root_dir    = '/home/kurodon/handwritten/dendo'
file_type   = '/*.jpg'             # glob pattern appended to a directory path
img_mode    = 'RGB'                # PIL mode every image is converted to
img_size    = 60                   # images are resized to img_size x img_size
size_start  = int(img_size * 1 )   # narrowest scan window used by the line scanner
size_end    = int(img_size * 2 )   # widest scan window used by the line scanner

if( cls=="F04-01" ):
    # file_type[2:] drops the leading '/*' and leaves the '.jpg' extension.
    target_file   = root_dir + '/target/single/' + target_f + file_type[2:]
    family        = "num_kana"
    train_dir     = root_dir+'/'+family+'/train/'
    test_dir      = root_dir+'/'+family+'/test/'
    family_dir    = [ family ]
    # Class labels: the index of a character in this string is its class id.
    f_num_kana    = "0123456789"
    f_num_kana   += "アイウエオカキクケコサシスセソタチツテトナニヌネノ"
    f_num_kana   += "ハヒフヘホマミムメモヤユヨラリルレロワヲン"
    group_dir     = family
    groups        = f_num_kana
    family_member = [ f_num_kana ]
    family_name   = cls
    nb_classes    = len( f_num_kana )
else:
    # Without this branch an unknown class id only surfaced further down as a
    # NameError on family_name, which hides the real cause.
    raise ValueError( 'unsupported class id: {!r} (expected "F04-01")'.format(cls) )

# Files shared between the training and recognition steps.
family_io   = root_dir+'/io/'+family_name+'_'
npy_x_train = family_io+'x_train.npy'
npy_y_train = family_io+'y_train.npy'
fs_json     = family_io+model_name+'model.json'   # model architecture (JSON)
fs_hdf5     = family_io+model_name+'model.hdf5'   # model weights

# Thresholds: eval_char3 grades per-character accuracy with best_eval and
# better_eval; better_eval is also the acceptance score of the line scanner;
# alike_char lists every score above multi_eval.
best_eval   = 0.79
better_eval = 0.59
multi_eval  = 0.3

def best_char( pred ):
    """Return ``(index, score)`` of the highest strictly positive score.

    The first occurrence wins on ties.  When ``pred`` is empty or contains no
    value greater than zero, ``(-1, 0.0)`` is returned.
    """
    top_index, top_score = -1, 0.0

    for position, score in enumerate(pred):
        if score > top_score:
            top_index, top_score = position, score

    return top_index, top_score

def load_img3( rdir ):
    """Load every image below ``rdir`` as a labelled array.

    For each character in ``groups`` the files matching ``file_type`` in
    ``rdir + <character>`` are resized to ``img_size`` x ``img_size`` and
    converted to ``img_mode``.  The label of an image is the index of its
    character in ``groups``.

    Args:
        rdir: directory prefix (including the trailing separator) holding one
            folder per character.

    Returns:
        ``(ximg, yimg)``: the stacked images and their labels, both int8.
    """
    images = []
    labels = []

    for idx,group in enumerate( groups ):
        files = glob.glob( rdir + group + file_type )
        print( 'char {} '.format(group),format(len(files)) )
        for ff in files:
            img = Image.open( ff )
            img = img.resize( (img_size,img_size) )
            img = img.convert( img_mode )
            # Keep the pixel array as it is; no nested-list round trip.
            images.append( np.asarray( img ) )
            labels.append( idx )

    # Casting the stacked uint8 pixels wraps 128..255 to negative signed
    # bytes, i.e. the same values the former list -> int8 conversion produced
    # on older NumPy, but without the OverflowError newer NumPy raises for
    # out-of-range Python integers.
    # NOTE(review): uint8 would be the natural dtype; change it together with
    # the prediction code and retrain -- confirm with the model owner.
    ximg  = np.array( images ).astype( 'int8' )
    yimg  = np.array( labels,dtype='int8' )

    return ximg,yimg

def eval_char3( eval_dir ):
    """Evaluate the saved model per character and print a summary.

    For every character in ``groups`` the images under
    ``root_dir/family/eval_dir/<character>`` are classified one at a time.
    A character counts as "best" when its accuracy exceeds ``best_eval``, as
    "better" when it exceeds ``better_eval`` and as "wrong" otherwise; wrong
    characters are also listed with their hit counts.  A character without
    any image is treated as accuracy 0.

    Args:
        eval_dir: sub-directory of ``root_dir/family`` containing one folder
            of images per character.
    """
    # Close the model description file deterministically.
    with open( fs_json,'r' ) as fp:
        json_string = fp.read()
    model = model_from_json(json_string)
    model.load_weights( fs_hdf5 )

    nncor  = 0   # correctly classified images over all characters
    ntotal = 0   # evaluated images over all characters
    n1     = 0
    n2     = 0
    n3     = 0

    for jj,group in enumerate( groups ):
        img_dir = root_dir+ '/' + family + '/' + eval_dir + '/' + group
        files   = glob.glob( img_dir+file_type )
        nfiles  = len(files)
        ncor    = 0
        for ff in files:
            img = Image.open( ff )
            img = img.resize( (img_size,img_size) )
            img = img.convert( img_mode )
            # Signed-byte cast of the uint8 pixels: same values as the former
            # list -> int8 conversion on older NumPy, without the
            # OverflowError newer NumPy raises for out-of-range integers.
            data = np.asarray( img ).astype( 'int8' )[np.newaxis]
            data = data.astype( 'float32') /255
            pre = model.predict( data )
            idx2,_ = best_char(pre[0])
            if( jj==idx2 ):
                ncor += 1

        nncor  += ncor
        ntotal += nfiles
        # An empty folder previously raised ZeroDivisionError.
        e1 = float( ncor ) / float( nfiles ) if nfiles else 0.0
        if( e1>best_eval ):
            n1 += 1
        elif( e1>better_eval ):
            n2 += 1
        else:
            n3 += 1
            print( group,ncor,' / ',nfiles )

    print( 'best char  :' ,n1,' / ',nb_classes, float(n1)/float(nb_classes) )
    print( 'better char:' ,n2,' / ',nb_classes, float(n2)/float(nb_classes) )
    print( 'wrong char :' ,n3,' / ',nb_classes, float(n3)/float(nb_classes) )
    # Use the real number of evaluated images; nb_classes*nfiles was only
    # right when every character had as many images as the last one.
    ratio = float(nncor)/float(ntotal) if ntotal else 0.0
    print( 'total      :' ,nncor,' / ',ntotal,ratio )

def alike_char( c_eval ):
    """Print the prediction details for every test image of one character.

    Each output line holds the image number, ``c_eval``, the character with
    the best score and, comma-terminated, every class score above
    ``multi_eval``.

    Args:
        c_eval: character whose folder ``root_dir/family/test/<c_eval>`` is
            evaluated.
    """
    # Close the model description file deterministically.
    with open( fs_json,'r' ) as fp:
        json_string = fp.read()
    model = model_from_json(json_string)
    model.load_weights( fs_hdf5 )

    img_dir = root_dir+ '/' + family + '/test/' + c_eval
    files   = glob.glob( img_dir+file_type )
    for ii,ff in enumerate(files):
        img = Image.open( ff )
        img = img.resize( (img_size,img_size) )
        img = img.convert( img_mode )
        # Signed-byte cast of the uint8 pixels: same values as the former
        # list -> int8 conversion on older NumPy, without the OverflowError
        # newer NumPy raises for out-of-range integers.
        data = np.asarray( img ).astype( 'int8' )[np.newaxis]
        data = data.astype( 'float32') /255

        scores = model.predict( data )[0]
        idx2,_ = best_char( scores )
        strong = ''.join( str(score) + "," for score in scores if score>multi_eval )
        print( str(ii) + " " + c_eval + " " + groups[idx2] + " " + strong )

失敗:015-kana-02.py

  1. 文字間の空白を区切りとして、最初に文字を抽出しました。
  2. 問題:文字が左寄せか中央寄せかによって、違う文字として認識されました。
    1. モデルの構築の見直しが必要です。
    2. 学習データをトリミングして作成してみます。
      そして、すべてのデータを左上寄せにします。
    3. 問題の複雑さの要因を増やさないためにアスペクト比は変えないようにします。
  3. プログラムをいじって、最適な文字位置にしたとき、"アイウエオ"が"アイウユオ"と認識されました。
  4. 最適な文字位置以外のときは、0%~40%の認識率でした。
import os,glob,sys
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import model_from_json

import c007_env as env

def main():
    """Split the target line into characters, classify them and report completion.

    All work is done by ``env.separate_char()``, which prints its own results.
    """
    env.separate_char()

    print( 'success' )


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
import sys,glob
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import model_from_json

# Command-line arguments: class id selecting the character set, file-name
# prefix of the saved model, and base name (no extension) of the target image.
cls         = sys.argv[1]
model_name  = sys.argv[2]
target_f    = sys.argv[3]

root_dir    = '/home/kurodon/handwritten/dendo'
file_type   = '/*.jpg'             # glob pattern appended to a directory path
img_mode    = 'RGB'                # PIL mode every image is converted to
img_size    = 60                   # images are resized to img_size x img_size
size_start  = int(img_size * 1 )   # not referenced by the code in this listing
size_end    = int(img_size * 2 )   # not referenced by the code in this listing
max_char    = 1000                 # capacity of the segment table in separate_char
std_span    = 3                    # blank-column count separate_char must exceed

if( cls=="F04-01" ):
    # file_type[2:] drops the leading '/*' and leaves the '.jpg' extension.
    target_file   = root_dir + '/target/single/' + target_f + file_type[2:]
    family        = "num_kana"
    train_dir     = root_dir+'/'+family+'/train/'
    test_dir      = root_dir+'/'+family+'/test/'
    family_dir    = [ family ]
    # Class labels: the index of a character in this string is its class id.
    f_num_kana    = "0123456789"
    f_num_kana   += "アイウエオカキクケコサシスセソタチツテトナニヌネノ"
    f_num_kana   += "ハヒフヘホマミムメモヤユヨラリルレロワヲン"
    group_dir     = family
    groups        = f_num_kana
    family_member = [ f_num_kana ]
    family_name   = cls
    nb_classes    = len( f_num_kana )
else:
    # Without this branch an unknown class id only surfaced further down as a
    # NameError on family_name, which hides the real cause.
    raise ValueError( 'unsupported class id: {!r} (expected "F04-01")'.format(cls) )

# Files shared between the training and recognition steps.
family_io   = root_dir+'/io/'+family_name+'_'
npy_x_train = family_io+'x_train.npy'
npy_y_train = family_io+'y_train.npy'
fs_json     = family_io+model_name+'model.json'   # model architecture (JSON)
fs_hdf5     = family_io+model_name+'model.hdf5'   # model weights

# Thresholds: eval_char3 grades per-character accuracy with best_eval and
# better_eval; alike_char and separate_char report scores above multi_eval.
best_eval   = 0.79
better_eval = 0.59
multi_eval  = 0.3

def best_char( pred ):
    """Return ``(index, score)`` of the highest strictly positive score.

    The first occurrence wins on ties.  When ``pred`` is empty or contains no
    value greater than zero, ``(-1, 0.0)`` is returned.
    """
    top_index, top_score = -1, 0.0

    for position, score in enumerate(pred):
        if score > top_score:
            top_index, top_score = position, score

    return top_index, top_score

def load_img3( rdir ):
    """Load every image below ``rdir`` as a labelled array.

    For each character in ``groups`` the files matching ``file_type`` in
    ``rdir + <character>`` are resized to ``img_size`` x ``img_size`` and
    converted to ``img_mode``.  The label of an image is the index of its
    character in ``groups``.

    Args:
        rdir: directory prefix (including the trailing separator) holding one
            folder per character.

    Returns:
        ``(ximg, yimg)``: the stacked images and their labels, both int8.
    """
    images = []
    labels = []

    for idx,group in enumerate( groups ):
        files = glob.glob( rdir + group + file_type )
        print( 'char {} '.format(group),format(len(files)) )
        for ff in files:
            img = Image.open( ff )
            img = img.resize( (img_size,img_size) )
            img = img.convert( img_mode )
            # Keep the pixel array as it is; no nested-list round trip.
            images.append( np.asarray( img ) )
            labels.append( idx )

    # Casting the stacked uint8 pixels wraps 128..255 to negative signed
    # bytes, i.e. the same values the former list -> int8 conversion produced
    # on older NumPy, but without the OverflowError newer NumPy raises for
    # out-of-range Python integers.
    # NOTE(review): uint8 would be the natural dtype; change it together with
    # the prediction code and retrain -- confirm with the model owner.
    ximg  = np.array( images ).astype( 'int8' )
    yimg  = np.array( labels,dtype='int8' )

    return ximg,yimg

def eval_char3( eval_dir ):
    """Evaluate the saved model per character and print a summary.

    For every character in ``groups`` the images under
    ``root_dir/family/eval_dir/<character>`` are classified one at a time.
    A character counts as "best" when its accuracy exceeds ``best_eval``, as
    "better" when it exceeds ``better_eval`` and as "wrong" otherwise; wrong
    characters are also listed with their hit counts.  A character without
    any image is treated as accuracy 0.

    Args:
        eval_dir: sub-directory of ``root_dir/family`` containing one folder
            of images per character.
    """
    # Close the model description file deterministically.
    with open( fs_json,'r' ) as fp:
        json_string = fp.read()
    model = model_from_json(json_string)
    model.load_weights( fs_hdf5 )

    nncor  = 0   # correctly classified images over all characters
    ntotal = 0   # evaluated images over all characters
    n1     = 0
    n2     = 0
    n3     = 0

    for jj,group in enumerate( groups ):
        img_dir = root_dir+ '/' + family + '/' + eval_dir + '/' + group
        files   = glob.glob( img_dir+file_type )
        nfiles  = len(files)
        ncor    = 0
        for ff in files:
            img = Image.open( ff )
            img = img.resize( (img_size,img_size) )
            img = img.convert( img_mode )
            # Signed-byte cast of the uint8 pixels: same values as the former
            # list -> int8 conversion on older NumPy, without the
            # OverflowError newer NumPy raises for out-of-range integers.
            data = np.asarray( img ).astype( 'int8' )[np.newaxis]
            data = data.astype( 'float32') /255
            pre = model.predict( data )
            idx2,_ = best_char(pre[0])
            if( jj==idx2 ):
                ncor += 1

        nncor  += ncor
        ntotal += nfiles
        # An empty folder previously raised ZeroDivisionError.
        e1 = float( ncor ) / float( nfiles ) if nfiles else 0.0
        if( e1>best_eval ):
            n1 += 1
        elif( e1>better_eval ):
            n2 += 1
        else:
            n3 += 1
            print( group,ncor,' / ',nfiles )

    print( 'best char  :' ,n1,' / ',nb_classes, float(n1)/float(nb_classes) )
    print( 'better char:' ,n2,' / ',nb_classes, float(n2)/float(nb_classes) )
    print( 'wrong char :' ,n3,' / ',nb_classes, float(n3)/float(nb_classes) )
    # Use the real number of evaluated images; nb_classes*nfiles was only
    # right when every character had as many images as the last one.
    ratio = float(nncor)/float(ntotal) if ntotal else 0.0
    print( 'total      :' ,nncor,' / ',ntotal,ratio )

def alike_char( c_eval ):
    """Print the prediction details for every test image of one character.

    Each output line holds the image number, ``c_eval``, the character with
    the best score and, comma-terminated, every class score above
    ``multi_eval``.

    Args:
        c_eval: character whose folder ``root_dir/family/test/<c_eval>`` is
            evaluated.
    """
    # Close the model description file deterministically.
    with open( fs_json,'r' ) as fp:
        json_string = fp.read()
    model = model_from_json(json_string)
    model.load_weights( fs_hdf5 )

    img_dir = root_dir+ '/' + family + '/test/' + c_eval
    files   = glob.glob( img_dir+file_type )
    for ii,ff in enumerate(files):
        img = Image.open( ff )
        img = img.resize( (img_size,img_size) )
        img = img.convert( img_mode )
        # Signed-byte cast of the uint8 pixels: same values as the former
        # list -> int8 conversion on older NumPy, without the OverflowError
        # newer NumPy raises for out-of-range integers.
        data = np.asarray( img ).astype( 'int8' )[np.newaxis]
        data = data.astype( 'float32') /255

        scores = model.predict( data )[0]
        idx2,_ = best_char( scores )
        strong = ''.join( str(score) + "," for score in scores if score>multi_eval )
        print( str(ii) + " " + c_eval + " " + groups[idx2] + " " + strong )

def separate_char():
    """Split the target line at blank columns and classify every segment.

    Steps:
      1. Open ``target_file``, convert it to ``img_mode`` and scale it to a
         height of ``img_size`` pixels, keeping the aspect ratio.
      2. Walk over the pixel columns from left to right.  A column counts as
         ink when at least one of its pixels has R+G+B below 128*3.  The
         start and end columns of each segment are stored in ``npos``.
      3. Load the model from ``fs_json`` / ``fs_hdf5``, paste each segment
         onto a white ``img_size`` x ``img_size`` canvas and print the best
         class when its score exceeds ``multi_eval``.

    NOTE(review): ``ispan`` is reset only when a segment starts or ends, not
    on ink columns, so blank columns are accumulated rather than counted as
    a consecutive run -- confirm this is intended.
    NOTE(review): a segment that is still open when the right edge is reached
    is never stored, and ink met before more than ``std_span`` blank columns
    have been counted does not open a segment -- confirm.
    """
    img  = Image.open( target_file )
    img  = img.convert( img_mode )
    # Width that keeps the aspect ratio at a height of img_size.
    sx   = img.size[0] * img_size / img.size[1]
#    isx  = int( img.size[0] )
    isx  = int( sx )
    img  = img.resize( (isx,img_size) )
    data = np.asarray( img )
    x2   = data.tolist()
    x1   = []
    x1.append( x2 )
    # Indexed below as data[0][row][column][channel].
    data = np.asarray( x1,dtype='int' )

    print( 'shape:',data.shape )

    # npos[0][k] / npos[1][k]: start / end column (stored as ipos+1) of segment k.
    # NOTE(review): more than max_char segments would index past the table.
    npos    = np.zeros((2, max_char))
    npos    = np.asarray( npos,dtype='int' )
#    print( npos.shape )
    ipos    = -1    # current column
    ispan   = 0     # blank columns counted since the last segment start/end
    nchar   = 0     # number of completed segments
    iswitch = 0     # not used below
    rflag   = 0     # 1 while inside a segment, 0 otherwise

    while ipos<(isx-1):
        nn    = 0
        iflag = 0   # becomes 1 when the current column contains ink
        ipos += 1

        for jj in range( img_size ):
            nn = 0
#            if( ipos<10 ):
#                print( data[0][jj][ipos-1][0],data[0][jj][ipos-1][1],data[0][jj][ipos-1][2] )
            # Sum of the three colour channels of pixel (row jj, column ipos).
            for kk in range( 3 ):
                nn += data[0][jj][ipos][kk]
            if( nn<(128*3) ):
                iflag = 1
                break

        if( iflag==0 ):
            # Blank column: close the open segment once enough blanks were counted.
            ispan += 1
#            print( 'ispan:',ispan,ipos )
            if( rflag==1 ):
                if( ispan>std_span ):
#                    print( 'nchar end  :',nchar,'>',ipos )
                    npos[1][nchar] = ipos+1
                    rflag  = 0
                    nchar += 1
                    ispan = 0
        else:
            # Ink column: open a segment if none is open and enough blanks were counted.
            if( rflag==0 ):
                if( ispan>std_span ):
#                    print( 'nchar start:',nchar,'>',ipos )
                    npos[0][nchar] = ipos+1
                    rflag  = 1
                    ispan = 0

    print( 'nchar:',nchar )

    for ii in range(nchar):
        print( ii,npos[0][ii],npos[1][ii] )

    # NOTE(review): the file object is never closed explicitly.
    json_string = open( fs_json,'r' ).read()
    model = model_from_json( json_string )
    model.load_weights( fs_hdf5 )

    for ii in range(nchar):
        leftx  = npos[0][ii]
        rightx = npos[1][ii]
#        print( leftx,rightx )
        # The crop box starts at row 1, so the top pixel row is left out.
        img_crop = img.crop( (leftx,1,rightx,img_size) )
#        img_crop = img_crop.resize( (img_size,img_size) )
        # White canvas; the segment is pasted unscaled at the fixed offset (25,1).
        # NOTE(review): a segment wider than img_size-25 presumably gets clipped
        # by the canvas -- confirm against the Pillow paste behaviour.
        img_crop2 = Image.new( img_mode, (img_size, img_size), (255,255,255) )
        img_crop2.paste( img_crop, (25,1) )
#        ff = root_dir + '/target/single/' + 'sep' + str(ii) + file_type[2:]
#        img_crop2.save( ff , quality=95 )
        data = np.asarray( img_crop2 )
        x2   = data.tolist()
        x1 = []
        x1.append( x2 )
        # NOTE(review): pixel values above 127 do not fit int8; depending on the
        # NumPy version this conversion wraps or raises OverflowError -- confirm.
        data = np.asarray( x1,dtype='int8' )
        data = data.astype( 'float32') /255
        pre = model.predict( data )
        # NOTE: the local name max shadows the builtin inside this function.
        idx2,max = best_char(pre[0])
        if( max>multi_eval ):
#        if( 1 ):
           print( 'result:',groups[idx2],idx2,max )

前の記事:
次の記事:

お問合せ・御要望

  • お問合せ
  • トップへ戻る
    タイトルとURLをコピーしました