File size: 19,289 Bytes
385769a
 
48ed553
b95691d
277e2a5
385769a
 
 
b95691d
385769a
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59c1497
277e2a5
751241b
 
 
59c1497
 
 
 
 
 
 
 
 
751241b
59c1497
ad30edb
59c1497
ad30edb
 
 
 
 
 
 
 
 
 
 
 
59c1497
 
 
 
 
ad30edb
751241b
59c1497
 
751241b
277e2a5
59c1497
 
751241b
59c1497
751241b
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
751241b
59c1497
 
 
 
ad30edb
751241b
277e2a5
 
 
 
 
 
 
 
 
 
 
 
59c1497
 
277e2a5
 
 
 
 
59c1497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277e2a5
59c1497
 
277e2a5
 
 
 
 
 
 
48ed553
b95691d
 
 
 
 
 
 
385769a
b95691d
48ed553
385769a
 
b95691d
48ed553
385769a
b95691d
48ed553
385769a
 
48ed553
 
 
 
 
b95691d
 
48ed553
 
b95691d
48ed553
385769a
b95691d
48ed553
b95691d
385769a
b95691d
 
 
 
 
 
 
 
 
 
 
 
385769a
 
 
 
b95691d
385769a
b95691d
 
 
 
 
 
 
 
 
 
385769a
48ed553
b95691d
 
 
48ed553
b95691d
 
 
48ed553
b95691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
48ed553
b95691d
48ed553
 
 
 
 
fa24f2f
 
 
 
b95691d
 
 
 
 
bacee7b
 
 
 
 
 
 
 
 
48ed553
 
 
 
 
 
 
 
 
b95691d
 
 
 
48ed553
b95691d
 
 
 
 
 
 
59c1497
b95691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
 
b95691d
 
 
385769a
 
 
 
 
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
import gradio as gr
import os
import json
import time
import re
from pageindex.core.tree_index import TreeIndex
from llm_config import get_llm_client, get_model_name

# Security: Check for APP_TOKEN env var
# NOTE(review): the hardcoded fallback means authentication is effectively
# public whenever APP_TOKEN is unset (the same literal is also pre-filled in
# the UI below) — consider failing closed (no default) and rotating this value.
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")


def _parse_tables_json(response_text):
    """
    Best-effort extraction of a JSON object from an LLM response string.

    Candidate spans are tried in order of decreasing strictness:
    1. the whole response as-is,
    2. the contents of a fenced ```json code block,
    3. the widest {...} span containing a "tables" key,
    4. any {...} span at all.
    Returns the first candidate that parses as JSON, or None if none do.
    """
    candidates = [response_text]

    code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
    if code_block_match:
        candidates.append(code_block_match.group(1))

    tables_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
    if tables_match:
        candidates.append(tables_match.group(0))

    any_object_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if any_object_match:
        candidates.append(any_object_match.group(0))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def extract_tables_from_markdown(markdown_text, token):
    """
    Extract all tables from a markdown document via PageIndex + an LLM.

    Builds a PageIndex tree over the document, retrieves table-rich context
    (falling back to a raw document excerpt when tree search is unavailable
    or returns too little), then asks the LLM for a structured JSON payload.

    Args:
        markdown_text: Full document content in markdown form.
        token: Access token; must equal REQUIRED_TOKEN.

    Returns:
        A JSON string: {"tables": [...]} on success, or
        {"error": "...", "tables": []} on auth/input/LLM failure.
    """
    if token != REQUIRED_TOKEN:
        return json.dumps({"error": "Invalid Authentication Token", "tables": []})

    if not markdown_text:
        return json.dumps({"error": "No markdown content provided", "tables": []})

    try:
        print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")

        # 1. Build the PageIndex Tree (best-effort: extraction still proceeds
        #    on a raw excerpt below if the build fails).
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            print("[PageIndex] Tree index built successfully for table extraction.")
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}, using fallback.")

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
            except Exception as e2:
                return json.dumps({"error": f"LLM client error: {str(e2)}", "tables": []})

        # 3. Search for table-rich sections
        table_query = """
        Find all tables in the document including: Well Headers, Formation Tops, Casing Details, 
        Drilling Data, Directional Surveys, Core Analysis, Cementing Records, BHA records, 
        Cuttings Descriptions, and any other tabular data.
        Extract ALL rows and columns from each table found.
        """

        context = ""
        try:
            if hasattr(tree, 'reasoning_search'):
                context = tree.reasoning_search(query=table_query, llm_client=client, model=model)
            else:
                # Fallback: use document directly
                context = markdown_text[:15000]  # First 15k chars
        except Exception as e:
            print(f"[PageIndex] Tree search error: {e}, using fallback.")
            context = markdown_text[:15000]

        # Guard against empty or near-empty search results.
        if not context or len(context) < 100:
            context = markdown_text[:15000]

        # 4. Generate structured JSON tables
        extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.

CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
   - Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
   - Lithology descriptions with depths
   - Drilling events with dates/depths
   - Equipment lists in bullet points
   - Any sequential data that can be tabulated

**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates

**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}

EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
   - "title": A descriptive title for the table
   - "headers": Array of column names
   - "rows": Array of row objects - MUST INCLUDE ALL ROWS
   - "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!

Return VALID JSON ONLY in this exact format:

{
    "tables": [
        {
            "title": "Well Header Information",
            "headers": ["Well Name", "API Number", "Operator", "Location"],
            "rows": [
                {"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
            ],
            "page_number": 1
        }
    ]
}

VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables

Return ONLY the JSON, no markdown, no explanations, no code blocks."""

        messages = [
            {"role": "system", "content": extraction_prompt},
            {"role": "user", "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."}
        ]

        print("[PageIndex] Sending table extraction request to LLM...")

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            max_tokens=16384,
            temperature=0
        )

        # message.content may be None (e.g. refusal / empty completion);
        # the original crashed on len(None) / None.strip() in that case.
        response_text = (response.choices[0].message.content or "").strip()
        print(f"[PageIndex] LLM response received: {len(response_text)} chars")

        # Parse JSON from response - handles raw JSON, fenced code blocks,
        # and JSON embedded in surrounding prose.
        data = _parse_tables_json(response_text)

        if data and "tables" in data:
            tables = data["tables"]
            # Ensure each table carries the fields downstream consumers expect.
            for table in tables:
                table.setdefault("page_number", 1)
                table.setdefault("source", "PageIndex")
            print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
            return json.dumps({"tables": tables})

        # If no valid JSON found, return empty
        print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
        return json.dumps({"tables": []})

    except Exception as e:
        print(f"[PageIndex] Table extraction error: {e}")
        return json.dumps({"error": str(e), "tables": []})


def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """
    Process document markdown and answer user query using PageIndex RAG.

    Yields streaming updates for real-time feedback: status lines wrapped in
    "<<<STATUS: ...>>>" markers, followed by the incrementally-growing final
    answer.

    Args:
        markdown_text: Full document content in markdown form.
        user_query: The question to answer from the document.
        token: Access token; must equal REQUIRED_TOKEN.
        chat_history_json: Optional JSON-encoded list of prior
            {"role": ..., "content": ...} messages to preserve conversation
            context.

    Yields:
        Progressive status/answer strings suitable for a streaming UI.
    """
    start_time = time.time()

    # Token validation
    # NOTE(review): these three error markers end with ">>>>>" while every
    # other status ends with ">>>" — confirm which terminator downstream
    # parsers expect before normalizing.
    if token != REQUIRED_TOKEN:
        yield "<<<STATUS: Error: Invalid Authentication Token.>>>>>"
        return

    if not markdown_text:
        yield "<<<STATUS: Error: Please provide document markdown text.>>>>>"
        return
    if not user_query:
        yield "<<<STATUS: Error: Please provide a query.>>>>>"
        return

    try:
        # History parsing (best-effort: malformed history is ignored, not fatal)
        chat_history = []
        if chat_history_json:
            try:
                chat_history = json.loads(chat_history_json)
            except Exception as e:
                print(f"[PageIndex] Warning: Could not parse chat history: {e}")

        reasoning_log = ""
        yield "<<<STATUS: Initializing PageIndex RAG Engine...>>>"

        # 1. Build the PageIndex Tree locally in the Space
        reasoning_log += "<<<STATUS: Building semantic tree index from markdown...>>>\n"
        yield reasoning_log

        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += f"<<<STATUS: Tree index built successfully.>>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += f"<<<STATUS: Warning: Tree build had issues, using fallback.>>>\n"
            yield reasoning_log

        # 2. Initialize the LLM client (NVIDIA first, Mistral as fallback)
        reasoning_log += "<<<STATUS: Initializing LLM client...>>>\n"
        yield reasoning_log

        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
            reasoning_log += f"<<<STATUS: Using NVIDIA model: {model}>>>\n"
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
                reasoning_log += f"<<<STATUS: Using Mistral model: {model} (NVIDIA fallback)>>>\n"
            except Exception as e2:
                yield f"<<<STATUS: Error: Could not initialize any LLM client. {str(e2)}>>>"
                return

        yield reasoning_log

        # 3. Perform Reasoning Search (Streamed)
        reasoning_log += "<<<STATUS: Performing semantic tree search for relevant sections...>>>\n"
        yield reasoning_log

        context = ""
        search_success = False

        # Use stream method if available; status updates are forwarded to the
        # caller, the final non-status payload becomes the context.
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(user_query=user_query, llm_client=client, model=model):
                    if update.startswith("<<<STATUS:"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    elif update.startswith("Error:"):
                        reasoning_log += f"<<<STATUS: Search warning: {update}>>>\n"
                        yield reasoning_log
                    else:
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += f"<<<STATUS: Warning: Streaming search failed, trying standard search...>>>\n"
                yield reasoning_log

        # Fallback to standard search if streaming failed or not available
        if not search_success:
            try:
                reasoning_log += "<<<STATUS: Using standard reasoning search...>>>\n"
                yield reasoning_log
                context = tree.reasoning_search(query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Use full document as context as last resort
                context = markdown_text[:8000]  # First 8000 chars
                reasoning_log += f"<<<STATUS: Warning: Using document excerpt as context.>>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            # Last resort: include first and last part of document.
            # (The original assigned a placeholder string here that was
            # immediately overwritten — dead code, removed.)
            context = markdown_text[:4000] + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n" + markdown_text[-4000:]

        # 4. Final Answer Generation
        reasoning_log += "<<<STATUS: Generating final answer with retrieved context...>>>\n"
        yield reasoning_log

        # Construct messages with history
        messages = [
            {"role": "system", "content": """You are a Senior Petroleum Engineer assistant.
Your goal is to extract precise technical data from the provided document context.

**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
  "type": "line" | "bar" | "area" | "scatter",
  "title": "Title",
  "xAxis": "x_label",
  "yAxis": "y_label",
  "data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""}
        ]

        # Add history
        for msg in chat_history:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            messages.append({"role": role, "content": content})

        messages.append({
            "role": "user", 
            "content": f"Context from document:\n{context}\n\nUser Query: {user_query}\n\nIf the query requests tabular data, provide a complete Markdown Table with all rows."
        })

        # Generate streaming response
        try:
            response_stream = client.chat.completions.create(
                model=model, 
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )

            full_response_text = ""
            for chunk in response_stream:
                # Guard: some providers emit keep-alive chunks with an empty
                # choices list; the original raised IndexError on those.
                if chunk.choices and chunk.choices[0].delta.content:
                    delta = chunk.choices[0].delta.content
                    full_response_text += delta
                    # Yield reasoning log + current response
                    yield reasoning_log + "\n" + "="*50 + "\nFINAL ANSWER:\n" + "="*50 + "\n" + full_response_text

            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")

        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"

    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<<STATUS: {error_msg}>>>"

# Gradio UI setup
# Two tabs over the same markdown-input pattern:
#   - "Chat / Query": streaming Q&A via process_docling_and_chat
#   - "Table Extraction": bulk JSON table dump via extract_tables_from_markdown
# Both are exposed as named API endpoints for programmatic callers.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")
    
    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here", 
                    lines=15, 
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?", 
                    placeholder="e.g., Extract all formation tops tables with depths"
                )
                # NOTE(review): pre-filling the password field with the real
                # default token defeats the auth check for anyone who loads
                # the UI — consider removing `value=`.
                token_input = gr.Textbox(
                    label="API Token", 
                    placeholder="Enter access token", 
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
                # Hidden carrier for conversation history (JSON list of
                # {"role", "content"} dicts), populated by API callers.
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
                output = gr.Textbox(label="Result", lines=15, interactive=False)
        
        btn.click(
            fn=process_docling_and_chat, 
            inputs=[input_md, query, token_input, history_json], 
            outputs=output, 
            api_name="process_docling_and_chat"
        )
    
    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here", 
                    lines=15, 
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                # NOTE(review): same pre-filled secret as the Chat tab.
                table_token_input = gr.Textbox(
                    label="API Token", 
                    placeholder="Enter access token", 
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
                table_btn = gr.Button("Extract All Tables", variant="primary")
                table_output = gr.Textbox(label="Extracted Tables (JSON)", lines=15, interactive=False)
        
        table_btn.click(
            fn=extract_tables_from_markdown, 
            inputs=[table_input_md, table_token_input], 
            outputs=table_output, 
            api_name="extract_tables"
        )

if __name__ == "__main__":
    # Enable queue for concurrency
    # NOTE(review): server_name="0.0.0.0" binds all interfaces — intended for
    # containerized deployment (e.g. a Space); confirm for other environments.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)