Spaces:
Running
Running
Update app.py
#4
by
GiantPandas
- opened
app.py
CHANGED
|
@@ -47,6 +47,18 @@ setup_poppler_linux()
|
|
| 47 |
|
| 48 |
|
| 49 |
preset_prompts = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
"Please convert the document into Markdown format.",
|
| 51 |
"Generate a clean and structured Markdown version of the document.",
|
| 52 |
"Transform this content into Markdown with proper headings and bullet points.",
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
preset_prompts = [
|
| 50 |
+
"""Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
|
| 51 |
+
1. Bbox format: [x1, y1, x2, y2]
|
| 52 |
+
2. Layout Categories: The possible categories are ['header', 'title', 'text', 'figure', 'table', 'formula', 'figure_caption', 'table_caption', 'formula_caption', 'figure_footnote', 'table_footnote', 'page_footnote', 'footer'].
|
| 53 |
+
3. Text Extraction & Formatting Rules:
|
| 54 |
+
- Figure: For the 'figure' category, the text field should be empty string.
|
| 55 |
+
- Formula: Format its text as LaTeX.
|
| 56 |
+
- Table: Format its text as HTML.
|
| 57 |
+
- All Others (Text, Title, etc.): Format their text as Markdown.
|
| 58 |
+
4. Constraints:
|
| 59 |
+
- The output text must be the original text from the image, with no translation.
|
| 60 |
+
- All layout elements must be sorted according to human reading order.
|
| 61 |
+
5. Final Output: The entire output must be a single JSON object.""",
|
| 62 |
"Please convert the document into Markdown format.",
|
| 63 |
"Generate a clean and structured Markdown version of the document.",
|
| 64 |
"Transform this content into Markdown with proper headings and bullet points.",
|