Files changed (1) hide show
  1. app.py +12 -0
app.py CHANGED
@@ -47,6 +47,18 @@ setup_poppler_linux()
47
 
48
 
49
  preset_prompts = [
 
 
 
 
 
 
 
 
 
 
 
 
50
  "Please convert the document into Markdown format.",
51
  "Generate a clean and structured Markdown version of the document.",
52
  "Transform this content into Markdown with proper headings and bullet points.",
 
47
 
48
 
49
  preset_prompts = [
50
+ """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
51
+ 1. Bbox format: [x1, y1, x2, y2]
52
+ 2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
53
+ 3. Text Extraction & Formatting Rules:
54
+ - Figure: For the 'figure' category, the text field should be empty string.
55
+ - Formula: Format its text as LaTeX.
56
+ - Table: Format its text as HTML.
57
+ - All Others (Text, Title, etc.): Format their text as Markdown.
58
+ 4. Constraints:
59
+ - The output text must be the original text from the image, with no translation.
60
+ - All layout elements must be sorted according to human reading order.
61
+ 5. Final Output: The entire output must be a single JSON object.""",
62
  "Please convert the document into Markdown format.",
63
  "Generate a clean and structured Markdown version of the document.",
64
  "Transform this content into Markdown with proper headings and bullet points.",