Infinity-Parser-Demo

Running

GiantPandas committed 29 days ago

Commit

0a8a858

verified ·

1 Parent(s): 7bc972d

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -47,6 +47,20 @@ setup_poppler_linux()
 preset_prompts = [
     "Please convert the document into Markdown format.",
     "Generate a clean and structured Markdown version of the document.",
     "Transform this content into Markdown with proper headings and bullet points.",

 preset_prompts = [
+"""
+Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+1. Bbox format: [x1, y1, x2, y2]
+2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
+3. Text Extraction & Formatting Rules:
+    - Figure: For the 'figure' category, the text field should be empty string.
+    - Formula: Format its text as LaTeX.
+    - Table: Format its text as HTML.
+    - All Others (Text, Title, etc.): Format their text as Markdown.
+4. Constraints:
+    - The output text must be the original text from the image, with no translation.
+    - All layout elements must be sorted according to human reading order.
+5. Final Output: The entire output must be a single JSON object.
+""",
     "Please convert the document into Markdown format.",
     "Generate a clean and structured Markdown version of the document.",
     "Transform this content into Markdown with proper headings and bullet points.",