import gradio as gr
import os
import json
import time
import re
from pageindex.core.tree_index import TreeIndex
from llm_config import get_llm_client, get_model_name
# Security: shared-secret token, overridable via the APP_TOKEN env var.
# NOTE(review): the hard-coded fallback secret is also baked into the UI
# textboxes below — rotate it by setting APP_TOKEN rather than editing code.
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")

# Pre-compiled patterns for recovering a JSON object from an LLM response.
_CODE_BLOCK_RE = re.compile(r'```(?:json)?\s*(\{.*\})\s*```', re.DOTALL)
_TABLES_OBJ_RE = re.compile(r'\{[\s\S]*"tables"[\s\S]*\}')
_ANY_OBJ_RE = re.compile(r'\{.*\}', re.DOTALL)


def _parse_tables_json(response_text):
    """Best-effort parse of an LLM response into a dict, or None.

    Tries, in order: direct JSON parse; a fenced ```json code block; an
    object containing a "tables" key; finally any {...} span.
    """
    candidates = [response_text]
    for pattern in (_CODE_BLOCK_RE, _TABLES_OBJ_RE, _ANY_OBJ_RE):
        match = pattern.search(response_text)
        if match:
            # _CODE_BLOCK_RE captures the object in group 1; the others don't.
            candidates.append(match.group(1) if pattern.groups else match.group(0))
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def extract_tables_from_markdown(markdown_text, token):
    """Extract all tables from the markdown document as a JSON string.

    Args:
        markdown_text: Docling-style markdown of the whole document.
        token: shared-secret auth token; must equal REQUIRED_TOKEN.

    Returns:
        A JSON string of the form {"tables": [...]}; on failure the object
        additionally carries an "error" key and an empty "tables" list.
    """
    if token != REQUIRED_TOKEN:
        return json.dumps({"error": "Invalid Authentication Token", "tables": []})
    if not markdown_text:
        return json.dumps({"error": "No markdown content provided", "tables": []})
    try:
        print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")

        # 1. Build the PageIndex tree (best effort — search falls back below).
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            print("[PageIndex] Tree index built successfully for table extraction.")
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}, using fallback.")

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
            except Exception as e2:
                return json.dumps({"error": f"LLM client error: {str(e2)}", "tables": []})

        # 3. Search for table-rich sections; fall back to a document prefix.
        table_query = """
Find all tables in the document including: Well Headers, Formation Tops, Casing Details,
Drilling Data, Directional Surveys, Core Analysis, Cementing Records, BHA records,
Cuttings Descriptions, and any other tabular data.
Extract ALL rows and columns from each table found.
"""
        context = ""
        try:
            if hasattr(tree, 'reasoning_search'):
                context = tree.reasoning_search(query=table_query, llm_client=client, model=model)
            else:
                # Fallback: use document directly
                context = markdown_text[:15000]  # First 15k chars
        except Exception as e:
            print(f"[PageIndex] Tree search error: {e}, using fallback.")
            context = markdown_text[:15000]
        if not context or len(context) < 100:
            context = markdown_text[:15000]

        # 4. Generate structured JSON tables.
        extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.
CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
- Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
- Lithology descriptions with depths
- Drilling events with dates/depths
- Equipment lists in bullet points
- Any sequential data that can be tabulated
**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates
**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}
EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
- "title": A descriptive title for the table
- "headers": Array of column names
- "rows": Array of row objects - MUST INCLUDE ALL ROWS
- "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!
Return VALID JSON ONLY in this exact format:
{
"tables": [
{
"title": "Well Header Information",
"headers": ["Well Name", "API Number", "Operator", "Location"],
"rows": [
{"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
],
"page_number": 1
}
]
}
VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables
Return ONLY the JSON, no markdown, no explanations, no code blocks."""
        messages = [
            {"role": "system", "content": extraction_prompt},
            {"role": "user", "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."}
        ]
        print("[PageIndex] Sending table extraction request to LLM...")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            max_tokens=16384,
            temperature=0
        )
        response_text = response.choices[0].message.content
        print(f"[PageIndex] LLM response received: {len(response_text)} chars")

        # Parse JSON from the response — handles raw JSON, fenced markdown
        # code blocks, and JSON embedded in surrounding prose.
        response_text = response_text.strip()
        data = _parse_tables_json(response_text)

        if data and "tables" in data:
            tables = data["tables"]
            # Ensure each table carries the fields downstream consumers expect.
            for table in tables:
                table.setdefault("page_number", 1)
                table.setdefault("source", "PageIndex")
            print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
            return json.dumps({"tables": tables})

        # If no valid JSON found, return empty
        print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
        return json.dumps({"tables": []})
    except Exception as e:
        print(f"[PageIndex] Table extraction error: {e}")
        return json.dumps({"error": str(e), "tables": []})
def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """Answer *user_query* against a markdown document using PageIndex RAG.

    Streams progress markers of the form "<<<STATUS: ...>>>" and then the
    reasoning log followed by the incrementally growing final answer, so
    the caller gets real-time feedback.

    Args:
        markdown_text: Docling-style markdown of the source document.
        user_query: the question or extraction request.
        token: shared-secret auth token; must equal REQUIRED_TOKEN.
        chat_history_json: optional JSON-encoded list of prior chat turns,
            each a {"role": ..., "content": ...} mapping.

    Yields:
        str: status updates, then the log plus the (partial) answer text.
    """
    start_time = time.time()

    # --- Input validation, fail fast with a status marker ---------------
    # Bug fix: these three error markers previously ended with ">>>>>"
    # (five '>') while every other status ends ">>>", which breaks any
    # consumer matching the ">>>" sentinel.
    if token != REQUIRED_TOKEN:
        yield "<<<STATUS: Error: Invalid Authentication Token.>>>"
        return
    if not markdown_text:
        yield "<<<STATUS: Error: Please provide document markdown text.>>>"
        return
    if not user_query:
        yield "<<<STATUS: Error: Please provide a query.>>>"
        return

    try:
        # History parsing (best effort — malformed history is ignored).
        chat_history = []
        if chat_history_json:
            try:
                chat_history = json.loads(chat_history_json)
            except Exception as e:
                print(f"[PageIndex] Warning: Could not parse chat history: {e}")

        reasoning_log = ""
        yield "<<<STATUS: Initializing PageIndex RAG Engine...>>>"

        # 1. Build the PageIndex tree locally in the Space.
        reasoning_log += "<<<STATUS: Building semantic tree index from markdown...>>>\n"
        yield reasoning_log
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += "<<<STATUS: Tree index built successfully.>>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += "<<<STATUS: Warning: Tree build had issues, using fallback.>>>\n"
            yield reasoning_log

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        reasoning_log += "<<<STATUS: Initializing LLM client...>>>\n"
        yield reasoning_log
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
            reasoning_log += f"<<<STATUS: Using NVIDIA model: {model}>>>\n"
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
                reasoning_log += f"<<<STATUS: Using Mistral model: {model} (NVIDIA fallback)>>>\n"
            except Exception as e2:
                yield f"<<<STATUS: Error: Could not initialize any LLM client. {str(e2)}>>>"
                return
        yield reasoning_log

        # 3. Perform reasoning search (streamed when the tree supports it).
        reasoning_log += "<<<STATUS: Performing semantic tree search for relevant sections...>>>\n"
        yield reasoning_log
        context = ""
        search_success = False
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(user_query=user_query, llm_client=client, model=model):
                    if update.startswith("<<<STATUS:"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    elif update.startswith("Error:"):
                        reasoning_log += f"<<<STATUS: Search warning: {update}>>>\n"
                        yield reasoning_log
                    else:
                        # A non-status, non-error chunk is the retrieved context.
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += "<<<STATUS: Warning: Streaming search failed, trying standard search...>>>\n"
                yield reasoning_log

        # Fallback to non-streaming search if streaming failed or is absent.
        if not search_success:
            try:
                reasoning_log += "<<<STATUS: Using standard reasoning search...>>>\n"
                yield reasoning_log
                context = tree.reasoning_search(query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Last resort: use a document excerpt as context.
                context = markdown_text[:8000]  # First 8000 chars
                reasoning_log += "<<<STATUS: Warning: Using document excerpt as context.>>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            # Empty retrieval: stitch together the head and tail of the
            # document. (A dead intermediate assignment that was immediately
            # overwritten here has been removed.)
            context = markdown_text[:4000] + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n" + markdown_text[-4000:]

        # 4. Final answer generation.
        reasoning_log += "<<<STATUS: Generating final answer with retrieved context...>>>\n"
        yield reasoning_log

        # System prompt + prior turns + current query with retrieved context.
        messages = [
            {"role": "system", "content": """You are a Senior Petroleum Engineer assistant.
Your goal is to extract precise technical data from the provided document context.
**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
"type": "line" | "bar" | "area" | "scatter",
"title": "Title",
"xAxis": "x_label",
"yAxis": "y_label",
"data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""}
        ]
        for msg in chat_history:
            messages.append({"role": msg.get("role", "user"), "content": msg.get("content", "")})
        messages.append({
            "role": "user",
            "content": f"Context from document:\n{context}\n\nUser Query: {user_query}\n\nIf the query requests tabular data, provide a complete Markdown Table with all rows."
        })

        # Stream the model output, echoing the log plus the growing answer.
        try:
            response_stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )
            full_response_text = ""
            for chunk in response_stream:
                if chunk.choices[0].delta.content:
                    delta = chunk.choices[0].delta.content
                    full_response_text += delta
                    yield reasoning_log + "\n" + "=" * 50 + "\nFINAL ANSWER:\n" + "=" * 50 + "\n" + full_response_text
            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")
        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<<STATUS: {error_msg}>>>"
# Gradio UI setup: two tabs, each wired to one of the functions above.
# NOTE(review): layout nesting reconstructed from a source whose indentation
# was stripped — confirm widget placement against the rendered Space.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")

    # Tab 1: free-form chat / query, answered by process_docling_and_chat.
    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?",
                    placeholder="e.g., Extract all formation tops tables with depths"
                )
                token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password",
                    # NOTE(review): the shared secret is hard-coded as the
                    # default value, which defeats the password masking and
                    # ships the secret to every visitor — confirm intended.
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
                # Hidden field so API callers can pass prior chat turns as JSON.
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
        output = gr.Textbox(label="Result", lines=15, interactive=False)
        # Streaming endpoint; also exposed over the Gradio API under
        # api_name "process_docling_and_chat".
        btn.click(
            fn=process_docling_and_chat,
            inputs=[input_md, query, token_input, history_json],
            outputs=output,
            api_name="process_docling_and_chat"
        )

    # Tab 2: one-shot extraction of every table as a JSON payload.
    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                table_token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password",
                    # NOTE(review): same hard-coded secret as above.
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
        table_btn = gr.Button("Extract All Tables", variant="primary")
        table_output = gr.Textbox(label="Extracted Tables (JSON)", lines=15, interactive=False)
        # Exposed over the Gradio API under api_name "extract_tables".
        table_btn.click(
            fn=extract_tables_from_markdown,
            inputs=[table_input_md, table_token_input],
            outputs=table_output,
            api_name="extract_tables"
        )

if __name__ == "__main__":
    # Enable queue for concurrency
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
|