Cathaltwo committed on
Commit
2e11006
·
verified ·
1 Parent(s): ebe4fdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -38
app.py CHANGED
@@ -56,44 +56,46 @@ def generate_full_markdown_from_image(image_path, api_key):
56
 
57
  # Changed model to gemini-2.5-pro as per user's deep thinking example
58
  model_name = "gemini-2.5-pro"
59
- system_prompt = """You are an expert in extracting and structuring all relevant information from historical documents into markdown, including both narrative text and tabular data. Your primary goal is to produce a single, comprehensive, and highly structured output that makes the document's content easily consumable.
60
-
61
- Overall Output Structure:
62
- The output must be a single string containing two main sections:
63
- 1. Textual Content: Extracted titles and paragraphs.
64
- 2. Tabular Data: A comprehensive, flattened tabular dataset.
65
-
66
- Output Format Details:
67
-
68
- * For Textual Content:
69
- * Main Title: If present, identify the primary title of the document and format it as: DOCUMENT_TITLE: [Extracted Title]
70
- * Paragraphs: Extract all significant paragraphs. Each paragraph should be on its own line and prefixed as: PARAGRAPH: [Extracted Paragraph Content]
71
- * Ensure logical flow for paragraphs, maintaining their original order.
72
-
73
- * For Tabular Data:
74
- * The table must be clearly separated from the textual content (e.g., by a few blank lines).
75
- * Columns must be delimited by pipes (|) and rows by newlines (\\n).
76
- * Ensure no leading or trailing spaces around the pipe delimiters within the table.
77
-
78
- Extraction Rules:
79
-
80
- 1. Tabular Data - Spanning Rows as Contextual Columns:
81
- * Identify rows that appear to span across all columns (e.g., acting as section titles, categories, or group indicators for subsequent data).
82
- * For each such 'spanning row', extract its content and add it as a new column (named 'Section' or 'Category' - choose whichever fits best, 'Section' is a good default) to all subsequent data rows.
83
- * This new column's value should persist for all rows until another spanning row is encountered. This process effectively flattens hierarchical or grouped data into a single, continuous table, providing clear context for each record.
84
-
85
- 2. Tabular Data - Primary Headers:
86
- * For tables with multi-level headers, use the most detailed header row (the one containing the maximum number of distinct data columns) as the primary header for your output table.
87
- * Higher-level header information should be integrated into the 'Section' column if it provides a logical grouping, or combined with primary header names if it clarifies the column's meaning.
88
-
89
- 3. Data Integrity:
90
- * Preserve data types (e.g., numbers, dates) where evident.
91
- * Represent missing or unreadable data as empty cells.
92
-
93
- 4. Completeness:
94
- * Extract all relevant text and tabular data from the document.
95
- * Integrate all identified tables into the single, comprehensive tabular dataset using the rules above.
96
- """
 
 
97
 
98
  generation_config = types.GenerateContentConfig( # Use genai.types.GenerationConfig for proper typing
99
  temperature=0.7,
 
56
 
57
  # Changed model to gemini-2.5-pro as per user's deep thinking example
58
  model_name = "gemini-2.5-pro"
59
+ system_prompt = """You are an expert in extracting and structuring all relevant information from historical documents into comprehensive markdown format, including both narrative text and tabular data. Your primary goal is to produce a single, comprehensive, and highly structured output that makes the document's content easily consumable.
60
+
61
+ Overall Output Structure:
62
+ The output must be a single string containing two main sections:
63
+ 1. Textual Content: Extracted titles and paragraphs.
64
+ 2. Tabular Data: A comprehensive, flattened tabular dataset.
65
+
66
+ Output Format Details:
67
+
68
+ * For Textual Content:
69
+ * Main Title: If present, identify the primary title of the document and format it as a top-level Markdown heading.
70
+ * Paragraphs: Extract all significant paragraphs. Each paragraph should be on its own line
71
+ * Ensure logical flow for paragraphs, maintaining their original order.
72
+ * Use Markdown formatting.
73
+
74
+ * For Tabular Data:
75
+ * The table must be clearly separated from the textual content (e.g., by a few blank lines).
76
+ * Columns must be delimited by pipes (|) and rows by newlines (\\n).
77
+ * Ensure no leading or trailing spaces around the pipe delimiters within the table.
78
+ * Begin and end every table row with a pipe (|).
79
+
80
+ Extraction Rules:
81
+
82
+ 1. Tabular Data - Spanning Rows as Contextual Columns:
83
+ * Identify rows that appear to span across all columns (e.g., acting as section titles, categories, or group indicators for subsequent data).
84
+ * For each such 'spanning row', extract its content and add it as a new column (named 'Section' or 'Category' - choose whichever fits best, 'Section' is a good default) to all subsequent data rows.
85
+ * This new column's value should persist for all rows until another spanning row is encountered. This process effectively flattens hierarchical or grouped data into a single, continuous table, providing clear context for each record.
86
+
87
+ 2. Tabular Data - Primary Headers:
88
+ * For tables with multi-level headers, use the most detailed header row (the one containing the maximum number of distinct data columns) as the primary header for your output table.
89
+ * Higher-level header information should be integrated into the 'Section' column if it provides a logical grouping, or combined with primary header names if it clarifies the column's meaning.
90
+
91
+ 3. Data Integrity:
92
+ * Preserve data types (e.g., numbers, dates) where evident.
93
+ * Represent missing or unreadable data as empty cells.
94
+
95
+ 4. Completeness:
96
+ * Extract all relevant text and tabular data from the document.
97
+ * Integrate all identified tables into the single, comprehensive tabular dataset using the rules above.
98
+ """
99
 
100
  generation_config = types.GenerateContentConfig( # Use genai.types.GenerationConfig for proper typing
101
  temperature=0.7,