서녘마리

C# 테서렉트 5.0.0 버전 사용하기 본문

리뷰

C# 테서렉트 5.0.0 버전 사용하기

서녘마리 2020. 1. 3. 13:51
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class TesseractService
    {
        private readonly string _tesseractExePath;
        private readonly string _language;

        /// <summary>
        /// Initializes a new instance of the <see cref="TesseractService"/> class.
        /// </summary>
        /// <param name="tesseractDir">The path for the Tesseract installation folder (C:\Program Files\Tesseract-OCR).</param>
        /// <param name="language">
        /// The language used to extract text from images (eng, por, etc). It is passed verbatim to the
        /// <c>-l</c> command line option.
        /// NOTE(review): Tesseract language data is normally named with three-letter codes ("eng"); the
        /// default "en" is kept for compatibility and presumably needs a matching traineddata file - confirm.
        /// </param>
        /// <param name="dataDir">The data with the trained models (tessdata). Download the models from https://github.com/tesseract-ocr/tessdata_fast. When null or empty, the "tessdata" folder inside <paramref name="tesseractDir"/> is used.</param>
        /// <exception cref="ArgumentNullException"><paramref name="tesseractDir"/> is null.</exception>
        public TesseractService(string tesseractDir, string language = "en", string dataDir = null)
        {
            if (tesseractDir == null)
                throw new ArgumentNullException(nameof(tesseractDir));

            // Tesseract configs.
            _tesseractExePath = Path.Combine(tesseractDir, "tesseract.exe");
            _language = language;

            if (String.IsNullOrEmpty(dataDir))
                dataDir = Path.Combine(tesseractDir, "tessdata");

            // The variable is set on the current process, so the tesseract child process started by
            // GetText inherits it. Being process-wide, the most recently constructed instance wins.
            Environment.SetEnvironmentVariable("TESSDATA_PREFIX", dataDir);
        }

        /// <summary>
        /// Read text from the images streams by running tesseract.exe over temporary copies of them.
        /// </summary>
        /// <param name="images">The images streams. Every stream is closed once it has been copied.</param>
        /// <returns>The images text, or an empty string when no image is given.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="images"/> is null.</exception>
        /// <exception cref="InvalidOperationException">Tesseract exited with a non-zero code; the message carries its standard error output.</exception>
        public string GetText(params Stream[] images)
        {
            if (images == null)
                throw new ArgumentNullException(nameof(images));

            if (images.Length == 0)
                return string.Empty;

            // Everything lives in a private, uniquely named folder that is removed in the finally block.
            var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
            Directory.CreateDirectory(tempPath);
            var tempInputFile = NewTempFileName(tempPath);
            var tempOutputFile = NewTempFileName(tempPath);

            try
            {
                WriteInputFiles(images, tempPath, tempInputFile);

                var info = new ProcessStartInfo
                {
                    FileName = _tesseractExePath,
                    // Paths are quoted because the temp folder may contain spaces (e.g. the user name).
                    Arguments = $"\"{tempInputFile}\" \"{tempOutputFile}\" -l {_language}",
                    RedirectStandardError = true,
                    RedirectStandardOutput = true,
                    CreateNoWindow = true,
                    UseShellExecute = false
                };

                using (var ps = Process.Start(info))
                {
                    // Drain both redirected pipes while the process runs. Waiting for exit first can
                    // deadlock: the child blocks on a full pipe buffer that nobody is reading.
                    var stdoutTask = ps.StandardOutput.ReadToEndAsync();
                    var stderrTask = ps.StandardError.ReadToEndAsync();

                    ps.WaitForExit();

                    var stderr = stderrTask.GetAwaiter().GetResult();
                    stdoutTask.GetAwaiter().GetResult();

                    if (ps.ExitCode != 0)
                        throw new InvalidOperationException(stderr);

                    // Tesseract appends ".txt" to the output base name it is given.
                    return File.ReadAllText(tempOutputFile + ".txt");
                }
            }
            finally
            {
                Directory.Delete(tempPath, true);
            }
        }

        /// <summary>
        /// Copies the input streams to files inside <paramref name="tempPath"/> and prepares
        /// <paramref name="tempInputFile"/> as the file handed to tesseract.
        /// </summary>
        /// <param name="inputStreams">The image streams (at least one).</param>
        /// <param name="tempPath">The folder receiving the temporary image files.</param>
        /// <param name="tempInputFile">The input file: the image itself for one stream, otherwise a text file listing one image path per line.</param>
        private static void WriteInputFiles(Stream[] inputStreams, string tempPath, string tempInputFile)
        {
            // If there is more than one image file, build the list file using the images as input files.
            if (inputStreams.Length > 1)
            {
                var imagesListFileContent = new StringBuilder();

                foreach (var inputStream in inputStreams)
                {
                    var imageFile = NewTempFileName(tempPath);

                    using (var tempStream = File.OpenWrite(imageFile))
                    {
                        CopyStream(inputStream, tempStream);
                    }

                    imagesListFileContent.AppendLine(imageFile);
                }

                File.WriteAllText(tempInputFile, imagesListFileContent.ToString());
            }
            else
            {
                // If there is only one image file, then use the image file as input file.
                using (var tempStream = File.OpenWrite(tempInputFile))
                {
                    CopyStream(inputStreams[0], tempStream);
                }
            }
        }

        /// <summary>
        /// Copies <paramref name="input"/> into <paramref name="output"/>, rewinding the input first
        /// when it is seekable, and then closes the input stream.
        /// </summary>
        private static void CopyStream(Stream input, Stream output)
        {
            if (input.CanSeek)
                input.Seek(0, SeekOrigin.Begin);

            input.CopyTo(output);
            input.Close();
        }

        /// <summary>
        /// Builds a new, GUID-named file path inside <paramref name="tempPath"/> (the file is not created).
        /// </summary>
        private static string NewTempFileName(string tempPath)
        {
            return Path.Combine(tempPath, Guid.NewGuid().ToString());
        }
    }
Colored by Color Scripter (http://colorscripter.com/info#e)
cs

테서렉트 서비스 클래스

1
2
3
4
5
6
7
// Folder holding the bundled Tesseract installation, located next to the application binaries.
static string program = System.IO.Path.Combine(Thread.GetDomain().BaseDirectory, "Tesseract-OCR");

// OCR service configured for Korean ("kor") with the tessdata folder inside the bundled installation.
TesseractService service = new TesseractService(program, "kor", System.IO.Path.Combine(program, "tessdata"));

// Runs OCR when the button is clicked.
private void button1_Click(object sender, EventArgs e)
{
    // NOTE(review): `stream` is not declared in this snippet - presumably the image Stream to
    // recognize, supplied by the surrounding form; confirm before use.
    var text = service.GetText(stream);
}
Colored by Color Scripter (http://colorscripter.com/info#e)
cs

실제 사용

 

NuGet으로 제공되는 테서렉트와 테서렉트 5.0.0 알파 버전 사이에 한국어 인식 능력 차이가 커서

테서렉트 5.0.0 버전을 C#에서 쓸 수 있는 코드를 가져왔습니다.

 

4.0부터 추가된 LSTM 기반의 OCR 엔진도 서비스 클래스의 실행 인자(Arguments)에 --oem 옵션을 지정하도록 수정하면 사용할 수 있습니다.

 

테서렉트 5.0.0 -alpha 링크

https://github.com/tesseract-ocr/tesseract/releases/tag/5.0.0-alpha

Comments