2using System.Collections;
3using System.Collections.Generic;
4using System.Runtime.InteropServices;
6using System.Text.RegularExpressions;
8using System.Threading.Tasks;
24 this.Progress = progress;
65 if (ocrLanguage !=
null && RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && RuntimeInformation.ProcessArchitecture == Architecture.X86 && (cancellationToken !=
default || progress !=
null))
67 throw new PlatformNotSupportedException(
"A cancellationToken or a progress callback are not supported on Windows x86!");
72 IntPtr nativeStructuredPage = IntPtr.Zero;
76 if (ocrLanguage !=
null)
78 result = (
ExitCodes)NativeMethods.GetStructuredTextPageWithOCR(context.NativeContext, list.NativeDisplayList, ref nativeStructuredPage, ref blockCount, (
float)zoom, pageBounds.
X0, pageBounds.
Y0, pageBounds.
X1, pageBounds.
Y1,
"TESSDATA_PREFIX=" + ocrLanguage.
Prefix, ocrLanguage.
Language, prog =>
80 progress?.Report(
new OCRProgressInfo(prog / 100.0));
82 if (cancellationToken.IsCancellationRequested)
94 result = (
ExitCodes)NativeMethods.GetStructuredTextPage(context.NativeContext, list.NativeDisplayList, ref nativeStructuredPage, ref blockCount);
97 cancellationToken.ThrowIfCancellationRequested();
104 throw new MuPDFException(
"Cannot create page", result);
106 throw new MuPDFException(
"Cannot populate page", result);
108 throw new MuPDFException(
"Unknown error", result);
111 IntPtr[] blockPointers =
new IntPtr[blockCount];
112 GCHandle blocksHandle = GCHandle.Alloc(blockPointers, GCHandleType.Pinned);
116 result = (
ExitCodes)NativeMethods.GetStructuredTextBlocks(nativeStructuredPage, blocksHandle.AddrOfPinnedObject());
123 throw new MuPDFException(
"Unknown error", result);
128 for (
int i = 0; i < blockCount; i++)
137 result = (
ExitCodes)NativeMethods.GetStructuredTextBlock(blockPointers[i], ref type, ref x0, ref y0, ref x1, ref y1, ref lineCount);
144 throw new MuPDFException(
"Unknown error", result);
147 Rectangle bBox =
new Rectangle(x0, y0, x1, y1);
149 switch ((MuPDFStructuredTextBlock.Types)type)
151 case MuPDFStructuredTextBlock.Types.Image:
154 case MuPDFStructuredTextBlock.Types.Text:
155 this.
StructuredTextBlocks[i] =
new MuPDFTextStructuredTextBlock(bBox, blockPointers[i], lineCount);
165 NativeMethods.DisposeStructuredTextPage(context.NativeContext, nativeStructuredPage);
176 for (
int i = 0; i < this.
Count; i++)
180 if (
this[i].BoundingBox.Contains(point))
182 for (
int j = 0; j <
this[i].Count; j++)
184 if (
this[i][j].BoundingBox.Contains(point))
186 for (
int k = 0; k <
this[i][j].Count; k++)
188 if (
this[i][j][k].BoundingQuad.Contains(point))
210 float minDistance =
float.MaxValue;
213 float minBlockDistance =
float.MaxValue;
214 float minLineDistance =
float.MaxValue;
216 for (
int i = 0; i < this.
Count; i++)
220 float dx = Math.Max(0, Math.Max(
this[i].BoundingBox.X0 - point.
X, point.
X -
this[i].BoundingBox.X1));
221 float dy = Math.Max(0, Math.Max(
this[i].BoundingBox.Y0 - point.
Y, point.
Y -
this[i].BoundingBox.Y1));
222 float blockDist = dx * dx + dy * dy;
224 if (
this[i].BoundingBox.Contains(point) || blockDist < minBlockDistance)
226 if (blockDist < minBlockDistance)
228 minBlockDistance = blockDist;
229 minLineDistance =
float.MaxValue;
232 for (
int j = 0; j <
this[i].Count; j++)
234 dx = Math.Max(0, Math.Max(
this[i][j].BoundingBox.X0 - point.
X, point.
X -
this[i][j].BoundingBox.X1));
235 dy = Math.Max(0, Math.Max(
this[i][j].BoundingBox.Y0 - point.
Y, point.
Y -
this[i][j].BoundingBox.Y1));
236 float lineDist = dx * dx + dy * dy;
238 if (
this[i][j].BoundingBox.Contains(point) || lineDist < minLineDistance)
240 if (lineDist < minLineDistance)
242 minLineDistance = lineDist;
245 for (
int k = 0; k <
this[i][j].Count; k++)
247 if (
this[i][j][k].BoundingQuad.Contains(point))
254 float minDist = (point.X -
this[i][j][k].BoundingQuad.UpperLeft.X) * (point.
X -
this[i][j][k].BoundingQuad.UpperLeft.X) + (point.Y -
this[i][j][k].BoundingQuad.UpperLeft.Y) * (point.
Y -
this[i][j][k].BoundingQuad.UpperLeft.Y);
255 minDist = Math.Min(minDist, (point.
X -
this[i][j][k].BoundingQuad.UpperRight.X) * (point.X -
this[i][j][k].BoundingQuad.UpperRight.X) + (point.
Y -
this[i][j][k].BoundingQuad.UpperRight.Y) * (point.Y -
this[i][j][k].BoundingQuad.UpperRight.Y));
256 minDist = Math.Min(minDist, (point.
X -
this[i][j][k].BoundingQuad.LowerRight.X) * (point.X -
this[i][j][k].BoundingQuad.LowerRight.X) + (point.
Y -
this[i][j][k].BoundingQuad.LowerRight.Y) * (point.Y -
this[i][j][k].BoundingQuad.LowerRight.Y));
257 minDist = Math.Min(minDist, (point.
X -
this[i][j][k].BoundingQuad.LowerLeft.X) * (point.X -
this[i][j][k].BoundingQuad.LowerLeft.X) + (point.
Y -
this[i][j][k].BoundingQuad.LowerLeft.Y) * (point.Y -
this[i][j][k].BoundingQuad.LowerLeft.Y));
259 if (minDist < minDistance)
261 minDistance = minDist;
283 if (range ==
null || range.
End ==
null)
291 if (rangeEnd < rangeStart)
294 rangeStart = rangeEnd;
305 yield
return this[rangeStart.
BlockIndex].BoundingBox.ToQuad();
324 yield
return this[rangeStart.
BlockIndex][j].BoundingBox.ToQuad();
333 yield
return this[i].BoundingBox.ToQuad();
360 yield
return this[rangeStart.
BlockIndex][j].BoundingBox.ToQuad();
388 if (range ==
null || range.
End ==
null)
396 if (selectionEnd < selectionStart)
399 selectionStart = selectionEnd;
403 StringBuilder builder =
new StringBuilder();
427 builder.AppendLine();
432 builder.AppendLine(
this[selectionStart.
BlockIndex][j].ToString());
441 builder.Append(
this[i].ToString());
463 builder.AppendLine();
469 builder.AppendLine(
this[selectionStart.
BlockIndex][j].ToString());
489 return builder.ToString();
497 public IEnumerable<MuPDFStructuredTextAddressSpan>
Search(Regex needle)
500 for (
int i = 0; i < this.
Count; i++)
504 for (
int j = 0; j <
this[i].Count; j++)
506 foreach (Match match
in needle.Matches(
this[i][j].Text))
513 while (startIndex < match.Index)
515 startIndex +=
this[i][j][kStart].Character.Length;
519 if (startIndex > match.Index)
525 int kEnd = kStart - 1;
527 while (length < match.Length)
530 length +=
this[i][j][kEnd].Character.Length;
547 IEnumerator IEnumerable.GetEnumerator()
598 internal MuPDFStructuredTextBlock(Rectangle boundingBox)
600 this.BoundingBox = boundingBox;
606 IEnumerator IEnumerable.GetEnumerator()
636 throw new IndexOutOfRangeException(
"A structured text block containing an image only has one line!");
667 public override int Count => ((IReadOnlyCollection<MuPDFStructuredTextLine>)
Lines).Count;
674 IntPtr[] linePointers =
new IntPtr[lineCount];
675 GCHandle linesHandle = GCHandle.Alloc(linePointers, GCHandleType.Pinned);
679 ExitCodes result = (
ExitCodes)NativeMethods.GetStructuredTextLines(blockPointer, linesHandle.AddrOfPinnedObject());
691 for (
int i = 0; i < lineCount; i++)
704 result = (
ExitCodes)NativeMethods.GetStructuredTextLine(linePointers[i], ref wmode, ref x0, ref y0, ref x1, ref y1, ref x, ref y, ref charCount);
729 return ((IEnumerable<MuPDFStructuredTextLine>)
Lines).GetEnumerator();
738 StringBuilder text =
new StringBuilder();
742 text.AppendLine(line.
Text);
745 return text.ToString();
798 public int Count => ((IReadOnlyCollection<MuPDFStructuredTextCharacter>)
Characters).Count;
809 this.BoundingBox = boundingBox;
812 new MuPDFStructuredTextCharacter(0, -1,
new PointF(boundingBox.
X0, boundingBox.
Y1),
new Quad(
new PointF(boundingBox.
X0, boundingBox.
Y1),
new PointF(boundingBox.
X0, boundingBox.
Y0),
new PointF(boundingBox.
X1, boundingBox.
Y0),
new PointF(boundingBox.
X1, boundingBox.
Y1)), 9)
818 this.WritingMode = writingMode;
819 this.Direction = direction;
820 this.BoundingBox = boundingBox;
822 IntPtr[] charPointers =
new IntPtr[charCount];
823 GCHandle charsHandle = GCHandle.Alloc(charPointers, GCHandleType.Pinned);
827 ExitCodes result = (
ExitCodes)NativeMethods.GetStructuredTextChars(linePointer, charsHandle.AddrOfPinnedObject());
837 Characters =
new MuPDFStructuredTextCharacter[charCount];
839 StringBuilder textBuilder =
new StringBuilder(charCount);
841 for (
int i = 0; i < charCount; i++)
857 result = (
ExitCodes)NativeMethods.GetStructuredTextChar(charPointers[i], ref c, ref color, ref originX, ref originY, ref size, ref llX, ref llY, ref ulX, ref ulY, ref urX, ref urY, ref lrX, ref lrY);
864 throw new MuPDFException(
"Unknown error", result);
867 Quad quad =
new Quad(
new PointF(llX, llY),
new PointF(ulX, ulY),
new PointF(urX, urY),
new PointF(lrX, lrY));
868 PointF origin =
new PointF(originX, originY);
870 Characters[i] =
new MuPDFStructuredTextCharacter(c, color, origin, quad, size);
874 this.Text = textBuilder.ToString();
894 return ((IEnumerable<MuPDFStructuredTextCharacter>)
Characters).GetEnumerator();
897 IEnumerator IEnumerable.GetEnumerator()
940 this.CodePoint = codePoint;
941 this.Character = Char.ConvertFromUtf32(codePoint);
943 this.Origin = origin;
944 this.BoundingQuad = boundingQuad;
986 this.BlockIndex = blockIndex;
987 this.LineIndex = lineIndex;
988 this.CharacterIndex = characterIndex;
1002 else if (
this > other)
1180 return first.CharacterIndex == second.CharacterIndex && first.LineIndex == second.LineIndex && first.BlockIndex == second.
BlockIndex;
1191 return first.CharacterIndex != second.CharacterIndex || first.LineIndex != second.LineIndex || first.BlockIndex != second.
BlockIndex;
1199 return ((this.BlockIndex * 33 * 33) ^ this.LineIndex * 33) ^ this.
CharacterIndex;
1212 int newCharacterIndex = this.CharacterIndex + 1;
1214 if (page[newBlockIndex][newLineIndex].Count <= newCharacterIndex)
1216 newCharacterIndex = 0;
1220 if (page[newBlockIndex].Count <= newLineIndex)
1226 if (page.
Count <= newBlockIndex)
1241 return this.CharacterIndex == other.CharacterIndex && this.LineIndex == other.LineIndex && this.BlockIndex == other.
BlockIndex;
A wrapper around a MuPDF context object, which contains the exception stack and the resource cache st...
The exception that is thrown when a MuPDF operation fails.
Represents a block containing a single image. The block contains a single line with a single characte...
override IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
Represents a range of characters in a MuPDFStructuredTextPage.
readonly? MuPDFStructuredTextAddress End
The address of the end of the range (inclusive), or null to signify an empty range.
readonly MuPDFStructuredTextAddress Start
The address of the start of the range.
MuPDFStructuredTextAddressSpan(MuPDFStructuredTextAddress start, MuPDFStructuredTextAddress? end)
Creates a new MuPDFStructuredTextAddressSpan corresponding to the specified character range.
Represents a structured text block containing text or an image.
abstract IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
abstract int Count
The number of lines in the block.
Types
Defines the type of the block.
Rectangle BoundingBox
The bounding box of the block.
abstract Types Type
The type of the block.
Represents a single text character.
override string ToString()
Returns a string representation of the character.
int Color
An sRGB hex representation of the colour of the character.
string Character
A string representation of the character. It may consist of a single char or of a surrogate pair of c...
Quad BoundingQuad
A quadrilateral bound for the character. This may or may not be a rectangle.
PointF Origin
The baseline origin of the character.
int CodePoint
The unicode code point of the character.
Represents a single line of text (i.e. characters that share a common baseline).
MuPDFStructuredTextCharacter[] Characters
The characters contained in the line.
WritingModes WritingMode
The writing mode of the text.
WritingModes
Defines the writing mode of the text.
IEnumerator< MuPDFStructuredTextCharacter > GetEnumerator()
override string ToString()
Returns a string representation of the line.
PointF Direction
The normalised direction of the text baseline.
int Count
The number of characters in the line.
string Text
A string representation of the characters contained in the line.
Rectangle BoundingBox
The bounding box of the line.
Represents a structured representation of the text contained in a page.
MuPDFStructuredTextAddress? GetHitAddress(PointF point, bool includeImages)
Gets the address of the character that contains the specified point in page units.
MuPDFStructuredTextAddress? GetClosestHitAddress(PointF point, bool includeImages)
Gets the address of the character closest to the specified point in page units.
IEnumerator< MuPDFStructuredTextBlock > GetEnumerator()
MuPDFStructuredTextBlock[] StructuredTextBlocks
The blocks contained in the page.
int Count
The number of blocks in the page.
IEnumerable< MuPDFStructuredTextAddressSpan > Search(Regex needle)
Searches for the specified Regex in the text of the page. A single match cannot span multiple lines.
string GetText(MuPDFStructuredTextAddressSpan range)
Gets the text corresponding to the specified character range. Blocks containing images are ignored.
IEnumerable< Quad > GetHighlightQuads(MuPDFStructuredTextAddressSpan range, bool includeImages)
Gets a collection of Quads delimiting the specified character range. Where possible,...
Represents a block containing multiple lines of text (typically a paragraph).
MuPDFStructuredTextLine[] Lines
The lines of text in the block.
override IEnumerator< MuPDFStructuredTextLine > GetEnumerator()
override string ToString()
Returns the text contained in the block as a string.
double Progress
A value between 0 and 1, indicating how much progress has been completed.
Represents a language used by Tesseract OCR.
string Prefix
The name of the folder where the language file is located.
string Language
The name of the language. The Tesseract library will assume that the trained language data file can b...
ExitCodes
Exit codes returned by native methods describing various errors that can occur.
Represents the address of a particular character in a MuPDFStructuredTextPage, in terms of block inde...
override int GetHashCode()
readonly int CharacterIndex
The index of the character within the line.
bool Equals(MuPDFStructuredTextAddress other)
Compares the current MuPDFStructuredTextAddress with another MuPDFStructuredTextAddress.
static bool operator==(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
readonly int LineIndex
The index of the line within the block.
static bool operator<(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator>=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator!=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
static bool operator<=(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
readonly int BlockIndex
The index of the block.
MuPDFStructuredTextAddress? Increment(MuPDFStructuredTextPage page)
Returns a MuPDFStructuredTextAddress corresponding to the next character in the specified page.
override bool Equals(object other)
int CompareTo(MuPDFStructuredTextAddress other)
Compares this MuPDFStructuredTextAddress with another MuPDFStructuredTextAddress.
static bool operator>(MuPDFStructuredTextAddress first, MuPDFStructuredTextAddress second)
Compares two MuPDFStructuredTextAddress.
MuPDFStructuredTextAddress(int blockIndex, int lineIndex, int characterIndex)
Creates a new MuPDFStructuredTextAddress from the specified indices.
float X
The horizontal coordinate of the point.
float Y
The vertical coordinate of the point.
Represents a quadrilateral (not necessarily a rectangle).
float Y0
The top coordinate of the rectangle.
float X1
The right coordinate of the rectangle.
float X0
The left coordinate of the rectangle.
float Y1
The bottom coordinate of the rectangle.
Represents the size of a rectangle.