MuPDFCore 1.8.0
Multiplatform .NET bindings for MuPDF
Loading...
Searching...
No Matches
MuPDFStructuredTextPage.cs
1using System;
2using System.Collections;
3using System.Collections.Generic;
4using System.Runtime.InteropServices;
5using System.Text;
6using System.Text.RegularExpressions;
7using System.Threading;
8using System.Threading.Tasks;
9
10namespace MuPDFCore
11{
/// <summary>
/// Carries the completion state of an OCR operation to progress listeners.
/// </summary>
public class OCRProgressInfo
{
    /// <summary>
    /// The completed fraction of the work, expressed as a value between 0 and 1.
    /// </summary>
    public double Progress { get; }

    // Instances are only created by the library itself.
    internal OCRProgressInfo(double progress) => Progress = progress;
}
27
28 /// <summary>
29 /// Represents a structured representation of the text contained in a page.
30 /// </summary>
31 public class MuPDFStructuredTextPage : IReadOnlyList<MuPDFStructuredTextBlock>
32 {
33 /// <summary>
34 /// The blocks contained in the page.
35 /// </summary>
36 public MuPDFStructuredTextBlock[] StructuredTextBlocks { get; private set; }
37
/// <summary>
/// The number of blocks in the page.
/// </summary>
public int Count => StructuredTextBlocks.Length;
42
/// <summary>
/// Retrieves a block of the page by its position.
/// </summary>
/// <param name="index">The index of the block.</param>
/// <returns>The block with the specified <paramref name="index"/>.</returns>
public MuPDFStructuredTextBlock this[int index]
{
    get
    {
        // Access the array through the read-only list interface, as the cast-based form does.
        IReadOnlyList<MuPDFStructuredTextBlock> blocks = StructuredTextBlocks;
        return blocks[index];
    }
}
49
/// <summary>
/// Gets the specified character in the page.
/// </summary>
/// <param name="address">The address (block, line and character index) of the character.</param>
/// <returns>A <see cref="MuPDFStructuredTextCharacter"/> representing the specified character.</returns>
// NOTE(review): the declaration line below was missing from this listing and has been reconstructed
// from the documentation comment and the getter body - confirm against the original source.
public MuPDFStructuredTextCharacter this[MuPDFStructuredTextAddress address]
{
    get
    {
        // Drill down block -> line -> character using the three indices of the address.
        return StructuredTextBlocks[address.BlockIndex][address.LineIndex][address.CharacterIndex];
    }
}
62
/// <summary>
/// Builds the managed structured text representation of a page from a display list, optionally running OCR.
/// The native structured text page is only needed while the managed blocks are being created, and it is
/// released before the constructor returns or throws.
/// </summary>
/// <param name="context">The context whose native handle is passed to the native calls.</param>
/// <param name="list">The display list from which the structured text is extracted.</param>
/// <param name="ocrLanguage">The language used for OCR. If this is <see langword="null"/>, no OCR is performed.</param>
/// <param name="zoom">The zoom factor passed to the native OCR call. Only used when <paramref name="ocrLanguage"/> is not <see langword="null"/>.</param>
/// <param name="pageBounds">The bounds passed to the native OCR call. Only used when <paramref name="ocrLanguage"/> is not <see langword="null"/>.</param>
/// <param name="cancellationToken">A token polled from the OCR progress callback and checked again once the native call has returned.</param>
/// <param name="progress">An optional receiver for OCR progress reports.</param>
/// <exception cref="PlatformNotSupportedException">OCR is requested on Windows x86 together with a <paramref name="cancellationToken"/> or a <paramref name="progress"/> callback.</exception>
/// <exception cref="OperationCanceledException">Cancellation has been requested through the <paramref name="cancellationToken"/>.</exception>
/// <exception cref="MuPDFException">A native call returned an error code.</exception>
internal MuPDFStructuredTextPage(MuPDFContext context, MuPDFDisplayList list, TesseractLanguage ocrLanguage, double zoom, Rectangle pageBounds, CancellationToken cancellationToken = default, IProgress<OCRProgressInfo> progress = null)
{
    if (ocrLanguage != null && RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && RuntimeInformation.ProcessArchitecture == Architecture.X86 && (cancellationToken != default || progress != null))
    {
        throw new PlatformNotSupportedException("A cancellationToken or a progress callback are not supported on Windows x86!");
    }

    int blockCount = -1;

    IntPtr nativeStructuredPage = IntPtr.Zero;

    ExitCodes result;

    if (ocrLanguage != null)
    {
        result = (ExitCodes)NativeMethods.GetStructuredTextPageWithOCR(context.NativeContext, list.NativeDisplayList, ref nativeStructuredPage, ref blockCount, (float)zoom, pageBounds.X0, pageBounds.Y0, pageBounds.X1, pageBounds.Y1, "TESSDATA_PREFIX=" + ocrLanguage.Prefix, ocrLanguage.Language, prog =>
        {
            // The native side reports a percentage; listeners receive a fraction between 0 and 1.
            progress?.Report(new OCRProgressInfo(prog / 100.0));

            // A non-zero return value is how a cancellation request is signalled to the native OCR routine.
            if (cancellationToken.IsCancellationRequested)
            {
                return 1;
            }
            else
            {
                return 0;
            }
        });
    }
    else
    {
        result = (ExitCodes)NativeMethods.GetStructuredTextPage(context.NativeContext, list.NativeDisplayList, ref nativeStructuredPage, ref blockCount);
    }

    // The native page is released on every exit path once the native call has succeeded. When the call
    // failed, nothing is released here (as before): who owns the pointer in that case is not visible
    // from this code - TODO confirm against the native implementation.
    bool ownsNativePage = result == ExitCodes.EXIT_SUCCESS;

    try
    {
        // Cancellation takes precedence over any error code returned by the native call.
        cancellationToken.ThrowIfCancellationRequested();

        switch (result)
        {
            case ExitCodes.EXIT_SUCCESS:
                break;
            case ExitCodes.ERR_CANNOT_CREATE_PAGE:
                throw new MuPDFException("Cannot create page", result);
            case ExitCodes.ERR_CANNOT_POPULATE_PAGE:
                throw new MuPDFException("Cannot populate page", result);
            default:
                throw new MuPDFException("Unknown error", result);
        }

        // The array is pinned so that the native code can write the block pointers directly into it.
        IntPtr[] blockPointers = new IntPtr[blockCount];
        GCHandle blocksHandle = GCHandle.Alloc(blockPointers, GCHandleType.Pinned);

        try
        {
            result = (ExitCodes)NativeMethods.GetStructuredTextBlocks(nativeStructuredPage, blocksHandle.AddrOfPinnedObject());

            switch (result)
            {
                case ExitCodes.EXIT_SUCCESS:
                    break;
                default:
                    throw new MuPDFException("Unknown error", result);
            }

            StructuredTextBlocks = new MuPDFStructuredTextBlock[blockCount];

            for (int i = 0; i < blockCount; i++)
            {
                int type = -1;
                float x0 = -1;
                float y0 = -1;
                float x1 = -1;
                float y1 = -1;
                int lineCount = -1;

                result = (ExitCodes)NativeMethods.GetStructuredTextBlock(blockPointers[i], ref type, ref x0, ref y0, ref x1, ref y1, ref lineCount);

                switch (result)
                {
                    case ExitCodes.EXIT_SUCCESS:
                        break;
                    default:
                        throw new MuPDFException("Unknown error", result);
                }

                Rectangle bBox = new Rectangle(x0, y0, x1, y1);

                // Note: a block whose type is neither Image nor Text leaves a null entry in the array.
                switch ((MuPDFStructuredTextBlock.Types)type)
                {
                    case MuPDFStructuredTextBlock.Types.Image:
                        this.StructuredTextBlocks[i] = new MuPDFImageStructuredTextBlock(bBox);
                        break;
                    case MuPDFStructuredTextBlock.Types.Text:
                        this.StructuredTextBlocks[i] = new MuPDFTextStructuredTextBlock(bBox, blockPointers[i], lineCount);
                        break;
                }
            }
        }
        finally
        {
            blocksHandle.Free();
        }
    }
    finally
    {
        if (ownsNativePage)
        {
            NativeMethods.DisposeStructuredTextPage(context.NativeContext, nativeStructuredPage);
        }
    }
}
167
/// <summary>
/// Finds the character whose bounding quad contains the specified <paramref name="point"/> (in page units).
/// </summary>
/// <param name="point">The point that must be contained by the character. This is expressed in page units (i.e. with a zoom factor of 1).</param>
/// <param name="includeImages">If this is <see langword="true"/>, blocks containing images may be returned. Otherwise, only blocks containing text are considered.</param>
/// <returns>The address of the first character containing the specified <paramref name="point"/>, or <see langword="null"/> if no character contains the <paramref name="point"/>.</returns>
public MuPDFStructuredTextAddress? GetHitAddress(PointF point, bool includeImages)
{
    for (int blockIndex = 0; blockIndex < this.Count; blockIndex++)
    {
        MuPDFStructuredTextBlock block = this[blockIndex];

        // Skip image blocks unless the caller asked for them.
        if (!includeImages && block.Type != MuPDFStructuredTextBlock.Types.Text)
        {
            continue;
        }

        // The bounding boxes are used to discard whole blocks and lines cheaply.
        if (!block.BoundingBox.Contains(point))
        {
            continue;
        }

        for (int lineIndex = 0; lineIndex < block.Count; lineIndex++)
        {
            MuPDFStructuredTextLine line = block[lineIndex];

            if (!line.BoundingBox.Contains(point))
            {
                continue;
            }

            for (int characterIndex = 0; characterIndex < line.Count; characterIndex++)
            {
                if (line[characterIndex].BoundingQuad.Contains(point))
                {
                    return new MuPDFStructuredTextAddress(blockIndex, lineIndex, characterIndex);
                }
            }
        }
    }

    return null;
}
201
/// <summary>
/// Gets the address of the character that is closest to the specified <paramref name="point"/> in page units.
/// </summary>
/// <param name="point">The point that must be closest to the character. This is expressed in page units (i.e. with a zoom factor of 1).</param>
/// <param name="includeImages">If this is <see langword="true"/>, blocks containing images may be returned. Otherwise, only blocks containing text are considered.</param>
/// <returns>The address of a character containing the <paramref name="point"/> if one is found during the search; otherwise, the address of the closest character that was examined, or <see langword="null"/> if no character was examined (e.g. because the page contains no characters).</returns>
public MuPDFStructuredTextAddress? GetClosestHitAddress(PointF point, bool includeImages)
{
    MuPDFStructuredTextAddress? closestHit = null;

    // All distances are squared distances: only their ordering matters.
    float minDistance = float.MaxValue;
    float minBlockDistance = float.MaxValue;
    float minLineDistance = float.MaxValue;

    for (int blockIndex = 0; blockIndex < this.Count; blockIndex++)
    {
        MuPDFStructuredTextBlock block = this[blockIndex];

        if (!includeImages && block.Type != MuPDFStructuredTextBlock.Types.Text)
        {
            continue;
        }

        // Distance from the point to the block's bounding box (0 on an axis where the point lies within the box).
        float blockDx = Math.Max(0, Math.Max(block.BoundingBox.X0 - point.X, point.X - block.BoundingBox.X1));
        float blockDy = Math.Max(0, Math.Max(block.BoundingBox.Y0 - point.Y, point.Y - block.BoundingBox.Y1));
        float blockDist = blockDx * blockDx + blockDy * blockDy;

        bool isCloserBlock = blockDist < minBlockDistance;

        // Only blocks that contain the point or are closer than the best block so far are examined.
        if (!isCloserBlock && !block.BoundingBox.Contains(point))
        {
            continue;
        }

        if (isCloserBlock)
        {
            minBlockDistance = blockDist;
            // A closer block restarts the search for the closest line.
            minLineDistance = float.MaxValue;
        }

        for (int lineIndex = 0; lineIndex < block.Count; lineIndex++)
        {
            MuPDFStructuredTextLine line = block[lineIndex];

            float lineDx = Math.Max(0, Math.Max(line.BoundingBox.X0 - point.X, point.X - line.BoundingBox.X1));
            float lineDy = Math.Max(0, Math.Max(line.BoundingBox.Y0 - point.Y, point.Y - line.BoundingBox.Y1));
            float lineDist = lineDx * lineDx + lineDy * lineDy;

            bool isCloserLine = lineDist < minLineDistance;

            if (!isCloserLine && !line.BoundingBox.Contains(point))
            {
                continue;
            }

            if (isCloserLine)
            {
                minLineDistance = lineDist;
            }

            for (int characterIndex = 0; characterIndex < line.Count; characterIndex++)
            {
                Quad quad = line[characterIndex].BoundingQuad;

                // A character that actually contains the point wins immediately.
                if (quad.Contains(point))
                {
                    return new MuPDFStructuredTextAddress(blockIndex, lineIndex, characterIndex);
                }

                // The quads should be small enough that the error due to only checking vertices and not sides is negligible.
                // Since the square root is monotonic, it can be skipped.
                float minDist = (point.X - quad.UpperLeft.X) * (point.X - quad.UpperLeft.X) + (point.Y - quad.UpperLeft.Y) * (point.Y - quad.UpperLeft.Y);
                minDist = Math.Min(minDist, (point.X - quad.UpperRight.X) * (point.X - quad.UpperRight.X) + (point.Y - quad.UpperRight.Y) * (point.Y - quad.UpperRight.Y));
                minDist = Math.Min(minDist, (point.X - quad.LowerRight.X) * (point.X - quad.LowerRight.X) + (point.Y - quad.LowerRight.Y) * (point.Y - quad.LowerRight.Y));
                minDist = Math.Min(minDist, (point.X - quad.LowerLeft.X) * (point.X - quad.LowerLeft.X) + (point.Y - quad.LowerLeft.Y) * (point.Y - quad.LowerLeft.Y));

                if (minDist < minDistance)
                {
                    minDistance = minDist;
                    closestHit = new MuPDFStructuredTextAddress(blockIndex, lineIndex, characterIndex);
                }
            }
        }
    }

    return closestHit;
}
274
/// <summary>
/// Gets a collection of <see cref="Quad"/>s delimiting the specified character <paramref name="range"/>. Where possible, these are collapsed at the line and block level. Each <see cref="Quad"/> may or may not be a rectangle.
/// </summary>
/// <param name="range">A <see cref="MuPDFStructuredTextAddressSpan"/> representing the character range. If the end comes before the start, the two are swapped.</param>
/// <param name="includeImages">If this is <see langword="true"/>, the bounding boxes for blocks containing images are also returned. Otherwise, only blocks containing text are considered.</param>
/// <returns>A lazy collection of <see cref="Quad"/>s delimiting the characters in the specified <paramref name="range"/>. The collection is empty if the <paramref name="range"/> or its end is <see langword="null"/>.</returns>
public IEnumerable<Quad> GetHighlightQuads(MuPDFStructuredTextAddressSpan range, bool includeImages)
{
    if (range == null || range.End == null)
    {
        yield break;
    }

    MuPDFStructuredTextAddress first = range.Start;
    MuPDFStructuredTextAddress last = range.End.Value;

    // Normalise the range so that "first" never comes after "last".
    if (last < first)
    {
        MuPDFStructuredTextAddress swap = first;
        first = last;
        last = swap;
    }

    if (first.BlockIndex != last.BlockIndex)
    {
        MuPDFStructuredTextBlock firstBlock = this[first.BlockIndex];

        // Remaining part of the first block.
        if (first.LineIndex == 0 && first.CharacterIndex == 0)
        {
            // The whole block is selected: a single quad is enough.
            if (includeImages || firstBlock.Type == MuPDFStructuredTextBlock.Types.Text)
            {
                yield return firstBlock.BoundingBox.ToQuad();
            }
        }
        else
        {
            MuPDFStructuredTextLine firstLine = firstBlock[first.LineIndex];

            if (first.CharacterIndex == 0)
            {
                yield return firstLine.BoundingBox.ToQuad();
            }
            else
            {
                for (int c = first.CharacterIndex; c < firstLine.Count; c++)
                {
                    yield return firstLine[c].BoundingQuad;
                }
            }

            for (int l = first.LineIndex + 1; l < firstBlock.Count; l++)
            {
                yield return firstBlock[l].BoundingBox.ToQuad();
            }
        }

        // Blocks that are entirely within the range.
        for (int b = first.BlockIndex + 1; b < last.BlockIndex; b++)
        {
            MuPDFStructuredTextBlock middleBlock = this[b];

            if (includeImages || middleBlock.Type == MuPDFStructuredTextBlock.Types.Text)
            {
                yield return middleBlock.BoundingBox.ToQuad();
            }
        }

        // What is left starts at the beginning of the last block.
        first = new MuPDFStructuredTextAddress(last.BlockIndex, 0, 0);
    }

    MuPDFStructuredTextBlock block = this[first.BlockIndex];

    if (!includeImages && block.Type != MuPDFStructuredTextBlock.Types.Text)
    {
        yield break;
    }

    if (first.LineIndex != last.LineIndex)
    {
        MuPDFStructuredTextLine openingLine = block[first.LineIndex];

        // Remaining part of the first line.
        if (first.CharacterIndex == 0)
        {
            yield return openingLine.BoundingBox.ToQuad();
        }
        else
        {
            for (int c = first.CharacterIndex; c < openingLine.Count; c++)
            {
                yield return openingLine[c].BoundingQuad;
            }
        }

        // Lines that are entirely within the range.
        for (int l = first.LineIndex + 1; l < last.LineIndex; l++)
        {
            yield return block[l].BoundingBox.ToQuad();
        }

        // What is left starts at the beginning of the last line.
        first = new MuPDFStructuredTextAddress(last.BlockIndex, last.LineIndex, 0);
    }

    MuPDFStructuredTextLine closingLine = block[first.LineIndex];

    // Remaining part of the last line: collapse to the line box if the whole line is covered.
    if (first.CharacterIndex == 0 && last.CharacterIndex == closingLine.Count - 1)
    {
        yield return closingLine.BoundingBox.ToQuad();
    }
    else
    {
        for (int c = first.CharacterIndex; c <= last.CharacterIndex; c++)
        {
            yield return closingLine[c].BoundingQuad;
        }
    }
}
380
/// <summary>
/// Gets the text corresponding to the specified character <paramref name="range"/>. Blocks containing images are ignored.
/// </summary>
/// <param name="range">A <see cref="MuPDFStructuredTextAddressSpan"/> representing the range of text to extract. If the end comes before the start, the two are swapped.</param>
/// <returns>A string representation of the text contained in the specified <paramref name="range"/>, or <see langword="null"/> if the <paramref name="range"/> or its end is <see langword="null"/>.</returns>
// NOTE(review): the signature line below was missing from this listing and has been reconstructed from the
// documentation comment and the method body; the method name in particular should be confirmed against the original source.
public string GetText(MuPDFStructuredTextAddressSpan range)
{
    if (range == null || range.End == null)
    {
        return null;
    }

    MuPDFStructuredTextAddress selectionStart = range.Start;
    MuPDFStructuredTextAddress selectionEnd = range.End.Value;

    // Normalise the range so that the start never comes after the end.
    if (selectionEnd < selectionStart)
    {
        MuPDFStructuredTextAddress temp = selectionStart;
        selectionStart = selectionEnd;
        selectionEnd = temp;
    }

    StringBuilder builder = new StringBuilder();

    if (selectionStart.BlockIndex != selectionEnd.BlockIndex)
    {
        //Add remaining part of this block
        if (selectionStart.LineIndex == 0 && selectionStart.CharacterIndex == 0)
        {
            // The whole block is selected.
            if (this[selectionStart.BlockIndex].Type == MuPDFStructuredTextBlock.Types.Text)
            {
                builder.Append(((MuPDFTextStructuredTextBlock)this[selectionStart.BlockIndex]).ToString());
            }
        }
        else
        {
            if (selectionStart.CharacterIndex == 0)
            {
                builder.AppendLine(this[selectionStart.BlockIndex][selectionStart.LineIndex].ToString());
            }
            else
            {
                for (int i = selectionStart.CharacterIndex; i < this[selectionStart.BlockIndex][selectionStart.LineIndex].Count; i++)
                {
                    builder.Append(this[selectionStart.BlockIndex][selectionStart.LineIndex][i].ToString());
                }
                builder.AppendLine();
            }

            for (int j = selectionStart.LineIndex + 1; j < this[selectionStart.BlockIndex].Count; j++)
            {
                builder.AppendLine(this[selectionStart.BlockIndex][j].ToString());
            }
        }

        //Add full blocks in the middle
        for (int i = selectionStart.BlockIndex + 1; i < selectionEnd.BlockIndex; i++)
        {
            if (this[i].Type == MuPDFStructuredTextBlock.Types.Text)
            {
                builder.Append(this[i].ToString());
            }
        }

        // What is left starts at the beginning of the last block.
        selectionStart = new MuPDFStructuredTextAddress(selectionEnd.BlockIndex, 0, 0);
    }

    if (this[selectionStart.BlockIndex].Type == MuPDFStructuredTextBlock.Types.Text)
    {
        if (selectionStart.LineIndex != selectionEnd.LineIndex)
        {
            //Add remaining part of this line
            if (selectionStart.CharacterIndex == 0)
            {
                builder.AppendLine(this[selectionStart.BlockIndex][selectionStart.LineIndex].ToString());
            }
            else
            {
                for (int i = selectionStart.CharacterIndex; i < this[selectionStart.BlockIndex][selectionStart.LineIndex].Count; i++)
                {
                    builder.Append(this[selectionStart.BlockIndex][selectionStart.LineIndex][i].ToString());
                }
                builder.AppendLine();
            }

            //Add full lines in the middle
            for (int j = selectionStart.LineIndex + 1; j < selectionEnd.LineIndex; j++)
            {
                builder.AppendLine(this[selectionStart.BlockIndex][j].ToString());
            }

            // What is left starts at the beginning of the last line.
            selectionStart = new MuPDFStructuredTextAddress(selectionEnd.BlockIndex, selectionEnd.LineIndex, 0);
        }

        //Add remaining part of this line (no line terminator is appended after the last line)
        if (selectionStart.CharacterIndex == 0 && selectionEnd.CharacterIndex == this[selectionStart.BlockIndex][selectionStart.LineIndex].Count - 1)
        {
            builder.Append(this[selectionStart.BlockIndex][selectionStart.LineIndex].ToString());
        }
        else
        {
            for (int j = selectionStart.CharacterIndex; j <= selectionEnd.CharacterIndex; j++)
            {
                builder.Append(this[selectionStart.BlockIndex][selectionStart.LineIndex][j].ToString());
            }
        }
    }

    return builder.ToString();
}
491
/// <summary>
/// Searches for the specified <see cref="Regex"/> in the text of the page. A single match cannot span multiple lines.
/// Only blocks containing text are searched, and matches of length zero are not reported, because they do not cover any character.
/// </summary>
/// <param name="needle">The <see cref="Regex"/> to search for.</param>
/// <returns>A lazy collection of <see cref="MuPDFStructuredTextAddressSpan"/>s representing all the occurrences of the <paramref name="needle"/> in the text.</returns>
public IEnumerable<MuPDFStructuredTextAddressSpan> Search(Regex needle)
{
    for (int i = 0; i < this.Count; i++)
    {
        if (this[i].Type != MuPDFStructuredTextBlock.Types.Text)
        {
            continue;
        }

        MuPDFStructuredTextBlock block = this[i];

        for (int j = 0; j < block.Count; j++)
        {
            MuPDFStructuredTextLine line = block[j];

            foreach (Match match in needle.Matches(line.Text))
            {
                // An empty match (e.g. from a pattern such as "a*") would produce a span whose end
                // lies before its start (possibly at character index -1), so it is skipped.
                if (!match.Success || match.Length == 0)
                {
                    continue;
                }

                // Match positions are expressed in UTF-16 code units, while characters are addressed
                // by index; a character may span two code units (surrogate pair).
                int startIndex = 0;
                int kStart = 0;

                while (startIndex < match.Index)
                {
                    startIndex += line[kStart].Character.Length;
                    kStart++;
                }

                // The match starts in the middle of a character: include that character.
                if (startIndex > match.Index)
                {
                    kStart--;
                }

                int length = 0;
                int kEnd = kStart - 1;

                // Extend the span until it covers at least as many code units as the match.
                while (length < match.Length)
                {
                    kEnd++;
                    length += line[kEnd].Character.Length;
                }

                yield return new MuPDFStructuredTextAddressSpan(new MuPDFStructuredTextAddress(i, j, kStart), new MuPDFStructuredTextAddress(i, j, kEnd));
            }
        }
    }
}
540
/// <inheritdoc/>
public IEnumerator<MuPDFStructuredTextBlock> GetEnumerator()
{
    // Enumerate the backing array through its generic interface.
    IEnumerable<MuPDFStructuredTextBlock> blocks = StructuredTextBlocks;
    return blocks.GetEnumerator();
}
546
IEnumerator IEnumerable.GetEnumerator()
{
    // NOTE(review): the body of this method was missing from this listing; it has been reconstructed
    // following the same pattern used by MuPDFStructuredTextLine - confirm against the original source.
    return StructuredTextBlocks.GetEnumerator();
}
551 }
552
/// <summary>
/// Base class for the blocks of a structured text page; a block holds either text or an image.
/// </summary>
public abstract class MuPDFStructuredTextBlock : IReadOnlyList<MuPDFStructuredTextLine>
{
    /// <summary>
    /// The kinds of block that can appear in a structured text page.
    /// </summary>
    public enum Types
    {
        /// <summary>
        /// The block contains text.
        /// </summary>
        Text = 0,

        /// <summary>
        /// The block contains an image.
        /// </summary>
        Image = 1
    }

    internal MuPDFStructuredTextBlock()
    {
    }

    internal MuPDFStructuredTextBlock(Rectangle boundingBox)
    {
        BoundingBox = boundingBox;
    }

    /// <summary>
    /// The type of the block.
    /// </summary>
    public abstract Types Type { get; }

    /// <summary>
    /// The bounding box of the block.
    /// </summary>
    public Rectangle BoundingBox { get; }

    /// <summary>
    /// The number of lines in the block.
    /// </summary>
    public abstract int Count { get; }

    /// <summary>
    /// Gets the specified line from the block.
    /// </summary>
    /// <param name="index">The index of the line to extract.</param>
    /// <returns>The <see cref="MuPDFStructuredTextLine"/> with the specified <paramref name="index"/>.</returns>
    public abstract MuPDFStructuredTextLine this[int index] { get; }

    /// <inheritdoc/>
    public abstract IEnumerator<MuPDFStructuredTextLine> GetEnumerator();

    // The non-generic enumerator simply forwards to the generic one.
    IEnumerator IEnumerable.GetEnumerator() => this.GetEnumerator();
}
611
/// <summary>
/// Represents a block containing a single image. The block contains a single line with a single character.
/// </summary>
// NOTE(review): the class declaration line below was missing from this listing and has been reconstructed
// from the constructor name, the base constructor call and the overridden members - confirm against the original source.
public class MuPDFImageStructuredTextBlock : MuPDFStructuredTextBlock
{
    /// <inheritdoc/>
    public override Types Type => Types.Image;

    /// <inheritdoc/>
    public override int Count => 1;

    // The only line of the block; it is created from the bounding box of the block.
    private readonly MuPDFStructuredTextLine Line;

    /// <inheritdoc/>
    /// <exception cref="IndexOutOfRangeException">The <paramref name="index"/> is not 0.</exception>
    public override MuPDFStructuredTextLine this[int index]
    {
        get
        {
            if (index == 0)
            {
                return Line;
            }
            else
            {
                throw new IndexOutOfRangeException("A structured text block containing an image only has one line!");
            }
        }
    }

    internal MuPDFImageStructuredTextBlock(Rectangle boundingBox) : base(boundingBox)
    {
        this.Line = new MuPDFStructuredTextLine(this.BoundingBox);
    }

    /// <inheritdoc/>
    public override IEnumerator<MuPDFStructuredTextLine> GetEnumerator()
    {
        return ((IEnumerable<MuPDFStructuredTextLine>)new MuPDFStructuredTextLine[] { Line }).GetEnumerator();
    }
}
652
/// <summary>
/// Represents a block containing multiple lines of text (typically a paragraph).
/// </summary>
// NOTE(review): the class declaration line and the declaration of the Lines property were missing from this
// listing and have been reconstructed from the constructor and the members that use them - confirm against the original source.
public class MuPDFTextStructuredTextBlock : MuPDFStructuredTextBlock
{
    /// <inheritdoc/>
    public override Types Type => Types.Text;

    /// <summary>
    /// The lines of text in the block.
    /// </summary>
    public MuPDFStructuredTextLine[] Lines { get; }

    /// <inheritdoc/>
    public override int Count => ((IReadOnlyCollection<MuPDFStructuredTextLine>)Lines).Count;

    /// <inheritdoc/>
    public override MuPDFStructuredTextLine this[int index] => ((IReadOnlyList<MuPDFStructuredTextLine>)Lines)[index];

    /// <summary>
    /// Creates the block by reading its lines through the native block pointer.
    /// </summary>
    /// <param name="boundingBox">The bounding box of the block.</param>
    /// <param name="blockPointer">A pointer to the native block.</param>
    /// <param name="lineCount">The number of lines in the block.</param>
    /// <exception cref="MuPDFException">A native call returned an error code.</exception>
    internal MuPDFTextStructuredTextBlock(Rectangle boundingBox, IntPtr blockPointer, int lineCount) : base(boundingBox)
    {
        // The array is pinned so that the native code can write the line pointers directly into it.
        IntPtr[] linePointers = new IntPtr[lineCount];
        GCHandle linesHandle = GCHandle.Alloc(linePointers, GCHandleType.Pinned);

        try
        {
            ExitCodes result = (ExitCodes)NativeMethods.GetStructuredTextLines(blockPointer, linesHandle.AddrOfPinnedObject());

            switch (result)
            {
                case ExitCodes.EXIT_SUCCESS:
                    break;
                default:
                    throw new MuPDFException("Unknown error", result);
            }

            Lines = new MuPDFStructuredTextLine[lineCount];

            for (int i = 0; i < lineCount; i++)
            {
                int wmode = -1;
                float x0 = -1;
                float y0 = -1;
                float x1 = -1;
                float y1 = -1;

                float x = -1;
                float y = -1;

                int charCount = -1;

                result = (ExitCodes)NativeMethods.GetStructuredTextLine(linePointers[i], ref wmode, ref x0, ref y0, ref x1, ref y1, ref x, ref y, ref charCount);

                switch (result)
                {
                    case ExitCodes.EXIT_SUCCESS:
                        break;
                    default:
                        throw new MuPDFException("Unknown error", result);
                }

                Rectangle bBox = new Rectangle(x0, y0, x1, y1);
                PointF direction = new PointF(x, y);

                Lines[i] = new MuPDFStructuredTextLine(linePointers[i], (MuPDFStructuredTextLine.WritingModes)wmode, direction, bBox, charCount);
            }
        }
        finally
        {
            linesHandle.Free();
        }
    }

    /// <inheritdoc/>
    public override IEnumerator<MuPDFStructuredTextLine> GetEnumerator()
    {
        return ((IEnumerable<MuPDFStructuredTextLine>)Lines).GetEnumerator();
    }

    /// <summary>
    /// Returns the text contained in the block as a <see cref="string"/>.
    /// </summary>
    /// <returns>The text contained in the block as a <see cref="string"/>. If the block contains at least one line, the return value has a line terminator at the end.</returns>
    public override string ToString()
    {
        StringBuilder text = new StringBuilder();

        foreach (MuPDFStructuredTextLine line in this)
        {
            text.AppendLine(line.Text);
        }

        return text.ToString();
    }
}
748
/// <summary>
/// Represents a single line of text (i.e. characters that share a common baseline).
/// </summary>
public class MuPDFStructuredTextLine : IReadOnlyList<MuPDFStructuredTextCharacter>
{
    /// <summary>
    /// Defines the writing mode of the text.
    /// </summary>
    public enum WritingModes
    {
        /// <summary>
        /// The text is written horizontally.
        /// </summary>
        Horizontal = 0,

        /// <summary>
        /// The text is written vertically.
        /// </summary>
        Vertical = 1
    }

    /// <summary>
    /// The writing mode of the text.
    /// </summary>
    public WritingModes WritingMode { get; }

    /// <summary>
    /// The normalised direction of the text baseline.
    /// </summary>
    public PointF Direction { get; }

    /// <summary>
    /// The bounding box of the line.
    /// </summary>
    public Rectangle BoundingBox { get; }

    /// <summary>
    /// The characters contained in the line.
    /// </summary>
    // NOTE(review): the declaration line below was missing from this listing and has been reconstructed
    // from the constructors and the members that use it - confirm against the original source.
    public MuPDFStructuredTextCharacter[] Characters { get; }

    /// <summary>
    /// A string representation of the characters contained in the line.
    /// </summary>
    public string Text { get; }

    /// <summary>
    /// The number of characters in the line.
    /// </summary>
    public int Count => ((IReadOnlyCollection<MuPDFStructuredTextCharacter>)Characters).Count;

    /// <summary>
    /// Gets the specified character from the line.
    /// </summary>
    /// <param name="index">The index of the character.</param>
    /// <returns>The <see cref="MuPDFStructuredTextCharacter"/> with the specified <paramref name="index"/>.</returns>
    public MuPDFStructuredTextCharacter this[int index] => ((IReadOnlyList<MuPDFStructuredTextCharacter>)Characters)[index];

    /// <summary>
    /// Creates a line containing a single placeholder character whose quad coincides with the <paramref name="boundingBox"/>.
    /// This constructor does not assign <see cref="Text"/>, <see cref="WritingMode"/> or <see cref="Direction"/>.
    /// </summary>
    /// <param name="boundingBox">The bounding box of the line.</param>
    internal MuPDFStructuredTextLine(Rectangle boundingBox)
    {
        this.BoundingBox = boundingBox;
        this.Characters = new MuPDFStructuredTextCharacter[]
        {
            new MuPDFStructuredTextCharacter(0, -1, new PointF(boundingBox.X0, boundingBox.Y1), new Quad(new PointF(boundingBox.X0, boundingBox.Y1), new PointF(boundingBox.X0, boundingBox.Y0), new PointF(boundingBox.X1, boundingBox.Y0), new PointF(boundingBox.X1, boundingBox.Y1)), 9)
        };
    }

    /// <summary>
    /// Creates the line by reading its characters through the native line pointer.
    /// </summary>
    /// <param name="linePointer">A pointer to the native line.</param>
    /// <param name="writingMode">The writing mode of the text.</param>
    /// <param name="direction">The direction of the text baseline.</param>
    /// <param name="boundingBox">The bounding box of the line.</param>
    /// <param name="charCount">The number of characters in the line.</param>
    /// <exception cref="MuPDFException">A native call returned an error code.</exception>
    internal MuPDFStructuredTextLine(IntPtr linePointer, WritingModes writingMode, PointF direction, Rectangle boundingBox, int charCount)
    {
        this.WritingMode = writingMode;
        this.Direction = direction;
        this.BoundingBox = boundingBox;

        // The array is pinned so that the native code can write the character pointers directly into it.
        IntPtr[] charPointers = new IntPtr[charCount];
        GCHandle charsHandle = GCHandle.Alloc(charPointers, GCHandleType.Pinned);

        try
        {
            ExitCodes result = (ExitCodes)NativeMethods.GetStructuredTextChars(linePointer, charsHandle.AddrOfPinnedObject());

            switch (result)
            {
                case ExitCodes.EXIT_SUCCESS:
                    break;
                default:
                    throw new MuPDFException("Unknown error", result);
            }

            Characters = new MuPDFStructuredTextCharacter[charCount];

            StringBuilder textBuilder = new StringBuilder(charCount);

            for (int i = 0; i < charCount; i++)
            {
                int c = -1;
                int color = -1;
                float originX = -1;
                float originY = -1;
                float size = -1;
                float llX = -1;
                float llY = -1;
                float ulX = -1;
                float ulY = -1;
                float urX = -1;
                float urY = -1;
                float lrX = -1;
                float lrY = -1;

                result = (ExitCodes)NativeMethods.GetStructuredTextChar(charPointers[i], ref c, ref color, ref originX, ref originY, ref size, ref llX, ref llY, ref ulX, ref ulY, ref urX, ref urY, ref lrX, ref lrY);

                switch (result)
                {
                    case ExitCodes.EXIT_SUCCESS:
                        break;
                    default:
                        throw new MuPDFException("Unknown error", result);
                }

                Quad quad = new Quad(new PointF(llX, llY), new PointF(ulX, ulY), new PointF(urX, urY), new PointF(lrX, lrY));
                PointF origin = new PointF(originX, originY);

                Characters[i] = new MuPDFStructuredTextCharacter(c, color, origin, quad, size);

                // The text of the line is the concatenation of the string representations of its characters.
                textBuilder.Append(Characters[i].Character);
            }

            this.Text = textBuilder.ToString();
        }
        finally
        {
            charsHandle.Free();
        }
    }

    /// <summary>
    /// Returns a string representation of the line.
    /// </summary>
    /// <returns>A string representation of the line.</returns>
    public override string ToString()
    {
        return this.Text;
    }

    /// <inheritdoc/>
    public IEnumerator<MuPDFStructuredTextCharacter> GetEnumerator()
    {
        return ((IEnumerable<MuPDFStructuredTextCharacter>)Characters).GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return Characters.GetEnumerator();
    }
}
902
/// <summary>
/// Represents a single text character.
/// </summary>
// NOTE(review): the class declaration line below was missing from this listing and has been reconstructed
// from the constructor name - confirm against the original source.
public class MuPDFStructuredTextCharacter
{
    /// <summary>
    /// The unicode code point of the character.
    /// </summary>
    public int CodePoint { get; }

    /// <summary>
    /// A string representation of the character. It may consist of a single <see cref="char"/> or of a surrogate pair of <see cref="char"/>s.
    /// </summary>
    public string Character { get; }

    /// <summary>
    /// An sRGB hex representation of the colour of the character.
    /// </summary>
    public int Color { get; }

    /// <summary>
    /// The baseline origin of the character.
    /// </summary>
    public PointF Origin { get; }

    /// <summary>
    /// A quadrilateral bound for the character. This may or may not be a rectangle.
    /// </summary>
    public Quad BoundingQuad { get; }

    /// <summary>
    /// The size in points of the character.
    /// </summary>
    public float Size { get; }

    /// <summary>
    /// Creates a character from its code point and layout information.
    /// </summary>
    /// <param name="codePoint">The unicode code point of the character; it is converted to a string with <see cref="Char.ConvertFromUtf32(int)"/>.</param>
    /// <param name="color">The colour of the character.</param>
    /// <param name="origin">The baseline origin of the character.</param>
    /// <param name="boundingQuad">The quadrilateral bounding the character.</param>
    /// <param name="size">The size of the character.</param>
    internal MuPDFStructuredTextCharacter(int codePoint, int color, PointF origin, Quad boundingQuad, float size)
    {
        this.CodePoint = codePoint;
        this.Character = Char.ConvertFromUtf32(codePoint);
        this.Color = color;
        this.Origin = origin;
        this.BoundingQuad = boundingQuad;
        this.Size = size;
    }

    /// <summary>
    /// Returns a string representation of the character.
    /// </summary>
    /// <returns>A string representation of the character.</returns>
    public override string ToString()
    {
        return this.Character;
    }
}
957
/// <summary>
/// Represents the address of a particular character in a <see cref="MuPDFStructuredTextPage"/>, in terms of block index, line index and character index.
/// Addresses are ordered lexicographically: by block index first, then by line index, then by character index.
/// </summary>
public struct MuPDFStructuredTextAddress : IComparable<MuPDFStructuredTextAddress>, IEquatable<MuPDFStructuredTextAddress>
{
    /// <summary>
    /// The index of the block.
    /// </summary>
    public readonly int BlockIndex;

    /// <summary>
    /// The index of the line within the block.
    /// </summary>
    public readonly int LineIndex;

    /// <summary>
    /// The index of the character within the line.
    /// </summary>
    public readonly int CharacterIndex;

    /// <summary>
    /// Creates a new <see cref="MuPDFStructuredTextAddress"/> from the specified indices.
    /// </summary>
    /// <param name="blockIndex">The index of the block.</param>
    /// <param name="lineIndex">The index of the line within the block.</param>
    /// <param name="characterIndex">The index of the character within the line.</param>
    public MuPDFStructuredTextAddress(int blockIndex, int lineIndex, int characterIndex)
    {
        this.BlockIndex = blockIndex;
        this.LineIndex = lineIndex;
        this.CharacterIndex = characterIndex;
    }

    /// <summary>
    /// Compares this <see cref="MuPDFStructuredTextAddress"/> with another <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="other">The <see cref="MuPDFStructuredTextAddress"/> to compare with the current instance.</param>
    /// <returns>-1 if the <paramref name="other"/> <see cref="MuPDFStructuredTextAddress"/> comes after the current instance, 1 if it comes before, or 0 if they represent the same address.</returns>
    public int CompareTo(MuPDFStructuredTextAddress other)
    {
        // Lexicographic comparison: the most significant component that differs decides the result.
        if (this.BlockIndex != other.BlockIndex)
        {
            return this.BlockIndex < other.BlockIndex ? -1 : 1;
        }

        if (this.LineIndex != other.LineIndex)
        {
            return this.LineIndex < other.LineIndex ? -1 : 1;
        }

        if (this.CharacterIndex != other.CharacterIndex)
        {
            return this.CharacterIndex < other.CharacterIndex ? -1 : 1;
        }

        return 0;
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the <paramref name="first"/> <see cref="MuPDFStructuredTextAddress"/> comes after the <paramref name="second"/> one; otherwise, <see langword="false"/>.</returns>
    public static bool operator >(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return first.CompareTo(second) > 0;
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the <paramref name="first"/> <see cref="MuPDFStructuredTextAddress"/> comes after the <paramref name="second"/> one or if they represent the same address; otherwise, <see langword="false"/>.</returns>
    public static bool operator >=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return first.CompareTo(second) >= 0;
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the <paramref name="first"/> <see cref="MuPDFStructuredTextAddress"/> comes before the <paramref name="second"/> one; otherwise, <see langword="false"/>.</returns>
    public static bool operator <(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return first.CompareTo(second) < 0;
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the <paramref name="first"/> <see cref="MuPDFStructuredTextAddress"/> comes before the <paramref name="second"/> one or if they represent the same address; otherwise, <see langword="false"/>.</returns>
    public static bool operator <=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return first.CompareTo(second) <= 0;
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the two <see cref="MuPDFStructuredTextAddress"/>es represent the same address; otherwise, <see langword="false"/>.</returns>
    public static bool operator ==(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return first.Equals(second);
    }

    /// <summary>
    /// Compares two <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="first">The first <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <param name="second">The second <see cref="MuPDFStructuredTextAddress"/> to compare.</param>
    /// <returns><see langword="true"/> if the two <see cref="MuPDFStructuredTextAddress"/>es represent different addresses; otherwise, <see langword="false"/>.</returns>
    public static bool operator !=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
    {
        return !first.Equals(second);
    }

    /// <inheritdoc/>
    public override int GetHashCode()
    {
        // Overflow is harmless for a hash code, so wrap-around is explicitly allowed.
        unchecked
        {
            return ((this.BlockIndex * 33 * 33) ^ (this.LineIndex * 33)) ^ this.CharacterIndex;
        }
    }

    /// <summary>
    /// Returns a <see cref="MuPDFStructuredTextAddress"/> corresponding to the next character in the specified page.
    /// </summary>
    /// <param name="page">The page the address refers to.</param>
    /// <returns>A <see cref="MuPDFStructuredTextAddress"/> corresponding to the next character in the specified page, or <see langword="null"/> if moving forward goes past the last block of the page.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="page"/> is <see langword="null"/>.</exception>
    public MuPDFStructuredTextAddress? Increment(MuPDFStructuredTextPage page)
    {
        if (page == null)
        {
            throw new ArgumentNullException(nameof(page));
        }

        int newBlockIndex = this.BlockIndex;
        int newLineIndex = this.LineIndex;
        int newCharacterIndex = this.CharacterIndex + 1;

        // Past the end of the current line: move to the start of the next line.
        if (page[newBlockIndex][newLineIndex].Count <= newCharacterIndex)
        {
            newCharacterIndex = 0;
            newLineIndex++;
        }

        // Past the end of the current block: move to the start of the next block.
        if (page[newBlockIndex].Count <= newLineIndex)
        {
            newLineIndex = 0;
            newBlockIndex++;
        }

        // Past the end of the page: there is no next character.
        if (page.Count <= newBlockIndex)
        {
            return null;
        }

        // NOTE(review): the new line/block is not checked for being empty; this presumably relies on
        // every block having at least one line and every line at least one character - confirm.
        return new MuPDFStructuredTextAddress(newBlockIndex, newLineIndex, newCharacterIndex);
    }

    /// <summary>
    /// Compares the current <see cref="MuPDFStructuredTextAddress"/> with another <see cref="MuPDFStructuredTextAddress"/>.
    /// </summary>
    /// <param name="other">The other <see cref="MuPDFStructuredTextAddress"/> to compare with the current instance.</param>
    /// <returns><see langword="true"/> if the two <see cref="MuPDFStructuredTextAddress"/>es represent the same address; otherwise, <see langword="false"/>.</returns>
    public bool Equals(MuPDFStructuredTextAddress other)
    {
        return this.CharacterIndex == other.CharacterIndex && this.LineIndex == other.LineIndex && this.BlockIndex == other.BlockIndex;
    }

    /// <inheritdoc/>
    public override bool Equals(object other)
    {
        return other is MuPDFStructuredTextAddress otherAddress && Equals(otherAddress);
    }
}
1250
1251 /// <summary>
1252 /// Represents a range of characters in a <see cref="MuPDFStructuredTextPage"/>.
1253 /// </summary>
1255 {
1256 /// <summary>
1257 /// The address of the start of the range.
1258 /// </summary>
1260
1261 /// <summary>
1262 /// The address of the end of the range (inclusive), or <see langword="null" /> to signify an empty range.
1263 /// </summary>
1265
1266 /// <summary>
1267 /// Creates a new <see cref="MuPDFStructuredTextAddressSpan"/> corresponding to the specified character range.
1268 /// </summary>
1269 /// <param name="start">The address of the start of the range.</param>
1270 /// <param name="end">The address of the end of the range (inclusive), or <see langword="null" /> to signify an empty range.</param>
1272 {
1273 this.Start = start;
1274 this.End = end;
1275 }
1276 }
1277}
A wrapper around a MuPDF context object, which contains the exception stack and the resource cache st...
Definition: MuPDFContext.cs:26
The exception that is thrown when a MuPDF operation fails.
Definition: MuPDF.cs:494
Represents a block containing a single image. The block contains a single line with a single characte...
override IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
Represents a range of characters in a MuPDFStructuredTextPage.
readonly MuPDFStructuredTextAddress? End
The address of the end of the range (inclusive), or null to signify an empty range.
readonly MuPDFStructuredTextAddress Start
The address of the start of the range.
MuPDFStructuredTextAddressSpan(MuPDFStructuredTextAddress start, MuPDFStructuredTextAddress? end)
Creates a new MuPDFStructuredTextAddressSpan corresponding to the specified character range.
Represents a structured text block containing text or an image.
abstract IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
abstract int Count
The number of lines in the block.
Types
Defines the type of the block.
Rectangle BoundingBox
The bounding box of the block.
abstract Types Type
The type of the block.
Represents a single text character.
override string ToString()
Returns a string representation of the character.
int Color
An sRGB hex representation of the colour of the character.
string Character
A string representation of the character. It may consist of a single char or of a surrogate pair of c...
Quad BoundingQuad
A quadrilateral bound for the character. This may or may not be a rectangle.
PointF Origin
The baseline origin of the character.
int CodePoint
The unicode code point of the character.
Represents a single line of text (i.e. characters that share a common baseline).
MuPDFStructuredTextCharacter[] Characters
The characters contained in the line.
WritingModes WritingMode
The writing mode of the text.
WritingModes
Defines the writing mode of the text.
IEnumerator< MuPDFStructuredTextCharacter > GetEnumerator()
override string ToString()
Returns a string representation of the line.
PointF Direction
The normalised direction of the text baseline.
int Count
The number of characters in the line.
string Text
A string representation of the characters contained in the line.
Rectangle BoundingBox
The bounding box of the line.
Represents a structured representation of the text contained in a page.
MuPDFStructuredTextAddress? GetHitAddress(PointF point, bool includeImages)
Gets the address of the character that contains the specified point in page units.
MuPDFStructuredTextAddress? GetClosestHitAddress(PointF point, bool includeImages)
Gets the address of the character that contains the specified point in page units.
IEnumerator< MuPDFStructuredTextBlock > GetEnumerator()
MuPDFStructuredTextBlock[] StructuredTextBlocks
The blocks contained in the page.
int Count
The number of blocks in the page.
IEnumerable< MuPDFStructuredTextAddressSpan > Search(Regex needle)
Searches for the specified Regex in the text of the page. A single match cannot span multiple lines.
string GetText(MuPDFStructuredTextAddressSpan range)
Gets the text corresponding to the specified character range. Blocks containing images are ignored.
IEnumerable< Quad > GetHighlightQuads(MuPDFStructuredTextAddressSpan range, bool includeImages)
Gets a collection of Quads delimiting the specified character range. Where possible,...
Represents a block containing multiple lines of text (typically a paragraph).
MuPDFStructuredTextLine[] Lines
The lines of text in the block.
override IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
override string ToString()
Returns the text contained in the block as a string.
double Progress
A value between 0 and 1, indicating how much progress has been completed.
Represents a language used by Tesseract OCR.
string Prefix
The name of the folder where the language file is located.
string Language
The name of the language. The Tesseract library will assume that the trained language data file can b...
ExitCodes
Exit codes returned by native methods describing various errors that can occur.
Definition: MuPDF.cs:32
Represents the address of a particular character in a MuPDFStructuredTextPage, in terms of block inde...
readonly int CharacterIndex
The index of the character within the line.
bool Equals(MuPDFStructuredTextAddress other)
Compares the current MuPDFStructuredTextAddress with another MuPDFStructuredTextAddress.
static bool operator==(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
readonly int LineIndex
The index of the line within the block.
static bool operator<(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator>=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator!=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator<=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
readonly int BlockIndex
The index of the block.
MuPDFStructuredTextAddress? Increment(MuPDFStructuredTextPage page)
Returns a MuPDFStructuredTextAddress corresponding to the next character in the specified page.
int CompareTo(MuPDFStructuredTextAddress other)
Compares this MuPDFStructuredTextAddress with another MuPDFStructuredTextAddress.
static bool operator>(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
MuPDFStructuredTextAddress(int blockIndex, int lineIndex, int characterIndex)
Creates a new MuPDFStructuredTextAddress from the specified indices.
Represents a point.
Definition: Rectangles.cs:567
float X
The horizontal coordinate of the point.
Definition: Rectangles.cs:571
float Y
The vertical coordinate of the point.
Definition: Rectangles.cs:576
Represents a quadrilateral (not necessarily a rectangle).
Definition: Rectangles.cs:594
Represents a rectangle.
Definition: Rectangles.cs:327
float Y0
The top coordinate of the rectangle.
Definition: Rectangles.cs:336
float X1
The right coordinate of the rectangle.
Definition: Rectangles.cs:341
float X0
The left coordinate of the rectangle.
Definition: Rectangles.cs:331
float Y1
The bottom coordinate of the rectangle.
Definition: Rectangles.cs:346
Represents the size of a rectangle.
Definition: Rectangles.cs:26