Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -56,44 +56,46 @@ def generate_full_markdown_from_image(image_path, api_key):
|
|
| 56 |
|
| 57 |
# Changed model to gemini-2.5-pro as per user's deep thinking example
|
| 58 |
model_name = "gemini-2.5-pro"
|
| 59 |
-
system_prompt = """You are an expert in extracting and structuring all relevant information from historical documents into markdown, including both narrative text and tabular data. Your primary goal is to produce a single, comprehensive, and highly structured output that makes the document's content easily consumable.
|
| 60 |
-
|
| 61 |
-
Overall Output Structure:
|
| 62 |
-
The output must be a single string containing two main sections:
|
| 63 |
-
1. Textual Content: Extracted titles and paragraphs.
|
| 64 |
-
2. Tabular Data: A comprehensive, flattened tabular dataset.
|
| 65 |
-
|
| 66 |
-
Output Format Details:
|
| 67 |
-
|
| 68 |
-
* For Textual Content:
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
| 97 |
|
| 98 |
generation_config = types.GenerateContentConfig( # Use genai.types.GenerationConfig for proper typing
|
| 99 |
temperature=0.7,
|
|
|
|
| 56 |
|
| 57 |
# Changed model to gemini-2.5-pro as per user's deep thinking example
|
| 58 |
model_name = "gemini-2.5-pro"
|
| 59 |
+
system_prompt = """You are an expert in extracting and structuring all relevant information from historical documents into comprehensive markdown format, including both narrative text and tabular data. Your primary goal is to produce a single, comprehensive, and highly structured output that makes the document's content easily consumable.
|
| 60 |
+
|
| 61 |
+
Overall Output Structure:
|
| 62 |
+
The output must be a single string containing two main sections:
|
| 63 |
+
1. Textual Content: Extracted titles and paragraphs.
|
| 64 |
+
2. Tabular Data: A comprehensive, flattened tabular dataset.
|
| 65 |
+
|
| 66 |
+
Output Format Details:
|
| 67 |
+
|
| 68 |
+
* For Textual Content:
|
| 69 |
+
* Main Title: If present, identify the primary title of the document and format it
|
| 70 |
+
* Paragraphs: Extract all significant paragraphs. Each paragraph should be on its own line
|
| 71 |
+
* Ensure logical flow for paragraphs, maintaining their original order.
|
| 72 |
+
* Use Markdown formatting.
|
| 73 |
+
|
| 74 |
+
* For Tabular Data:
|
| 75 |
+
* The table must be clearly separated from the textual content (e.g., by a few blank lines).
|
| 76 |
+
* Columns must be delimited by pipes (|) and rows by newlines (\\n).
|
| 77 |
+
* Ensure no leading or trailing spaces around the pipe delimiters within the table.
|
| 78 |
+
* Remember to include a pipe (|) at the start and at the end of every row.
|
| 79 |
+
|
| 80 |
+
Extraction Rules:
|
| 81 |
+
|
| 82 |
+
1. Tabular Data - Spanning Rows as Contextual Columns:
|
| 83 |
+
* Identify rows that appear to span across all columns (e.g., acting as section titles, categories, or group indicators for subsequent data).
|
| 84 |
+
* For each such 'spanning row', extract its content and add it as a new column (named 'Section' or 'Category' - choose whichever fits best, 'Section' is a good default) to all subsequent data rows.
|
| 85 |
+
* This new column's value should persist for all rows until another spanning row is encountered. This process effectively flattens hierarchical or grouped data into a single, continuous table, providing clear context for each record.
|
| 86 |
+
|
| 87 |
+
2. Tabular Data - Primary Headers:
|
| 88 |
+
* For tables with multi-level headers, use the most detailed header row (the one containing the maximum number of distinct data columns) as the primary header for your output table.
|
| 89 |
+
* Higher-level header information should be integrated into the 'Section' column if it provides a logical grouping, or combined with primary header names if it clarifies the column's meaning.
|
| 90 |
+
|
| 91 |
+
3. Data Integrity:
|
| 92 |
+
* Preserve data types (e.g., numbers, dates) where evident.
|
| 93 |
+
* Represent missing or unreadable data as empty cells.
|
| 94 |
+
|
| 95 |
+
4. Completeness:
|
| 96 |
+
* Extract all relevant text and tabular data from the document.
|
| 97 |
+
* Integrate all identified tables into the single, comprehensive tabular dataset using the rules above.
|
| 98 |
+
"""
|
| 99 |
|
| 100 |
generation_config = types.GenerateContentConfig( # Use genai.types.GenerationConfig for proper typing
|
| 101 |
temperature=0.7,
|