GiantPandas committed on
Commit
0a8a858
·
verified ·
1 Parent(s): 7bc972d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -47,6 +47,20 @@ setup_poppler_linux()
47
 
48
 
49
  preset_prompts = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  "Please convert the document into Markdown format.",
51
  "Generate a clean and structured Markdown version of the document.",
52
  "Transform this content into Markdown with proper headings and bullet points.",
 
47
 
48
 
49
  preset_prompts = [
50
+ """
51
+ Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
52
+ 1. Bbox format: [x1, y1, x2, y2]
53
+ 2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
54
+ 3. Text Extraction & Formatting Rules:
55
+ - Figure: For the 'figure' category, the text field should be empty string.
56
+ - Formula: Format its text as LaTeX.
57
+ - Table: Format its text as HTML.
58
+ - All Others (Text, Title, etc.): Format their text as Markdown.
59
+ 4. Constraints:
60
+ - The output text must be the original text from the image, with no translation.
61
+ - All layout elements must be sorted according to human reading order.
62
+ 5. Final Output: The entire output must be a single JSON object.
63
+ """,
64
  "Please convert the document into Markdown format.",
65
  "Generate a clean and structured Markdown version of the document.",
66
  "Transform this content into Markdown with proper headings and bullet points.",