File size: 19,289 Bytes
385769a
 
48ed553
b95691d
277e2a5
385769a
 
 
b95691d
385769a
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59c1497
277e2a5
751241b
 
 
59c1497
 
 
 
 
 
 
 
 
751241b
59c1497
ad30edb
59c1497
ad30edb
 
 
 
 
 
 
 
 
 
 
 
59c1497
 
 
 
 
ad30edb
751241b
59c1497
 
751241b
277e2a5
59c1497
 
751241b
59c1497
751241b
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
751241b
59c1497
 
 
 
ad30edb
751241b
277e2a5
 
 
 
 
 
 
 
 
 
 
 
59c1497
 
277e2a5
 
 
 
 
59c1497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277e2a5
59c1497
 
277e2a5
 
 
 
 
 
 
48ed553
b95691d
 
 
 
 
 
 
385769a
b95691d
48ed553
385769a
 
b95691d
48ed553
385769a
b95691d
48ed553
385769a
 
48ed553
 
 
 
 
b95691d
 
48ed553
 
b95691d
48ed553
385769a
b95691d
48ed553
b95691d
385769a
b95691d
 
 
 
 
 
 
 
 
 
 
 
385769a
 
 
 
b95691d
385769a
b95691d
 
 
 
 
 
 
 
 
 
385769a
48ed553
b95691d
 
 
48ed553
b95691d
 
 
48ed553
b95691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
48ed553
b95691d
48ed553
 
 
 
 
fa24f2f
 
 
 
b95691d
 
 
 
 
bacee7b
 
 
 
 
 
 
 
 
48ed553
 
 
 
 
 
 
 
 
b95691d
 
 
 
48ed553
b95691d
 
 
 
 
 
 
59c1497
b95691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
 
b95691d
 
 
385769a
 
 
 
 
 
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
277e2a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385769a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
import gradio as gr
import os
import json
import time
import re
from pageindex.core.tree_index import TreeIndex
from llm_config import get_llm_client, get_model_name

# Security: Check for APP_TOKEN env var
# NOTE(review): the hardcoded fallback means authentication is effectively
# public whenever APP_TOKEN is unset (the same literal is also pre-filled in
# the UI below) — consider failing closed (no default) and rotating this value.
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")


def _parse_tables_json(response_text):
    """
    Best-effort extraction of a JSON object from an LLM response string.

    Candidate spans are tried in order of decreasing strictness:
    1. the whole response as-is,
    2. the contents of a fenced ```json code block,
    3. the widest {...} span containing a "tables" key,
    4. any {...} span at all.
    Returns the first candidate that parses as JSON, or None if none do.
    """
    candidates = [response_text]

    code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
    if code_block_match:
        candidates.append(code_block_match.group(1))

    tables_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
    if tables_match:
        candidates.append(tables_match.group(0))

    any_object_match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if any_object_match:
        candidates.append(any_object_match.group(0))

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def extract_tables_from_markdown(markdown_text, token):
    """
    Extract all tables from a markdown document via PageIndex + an LLM.

    Builds a PageIndex tree over the document, retrieves table-rich context
    (falling back to a raw document excerpt when tree search is unavailable
    or returns too little), then asks the LLM for a structured JSON payload.

    Args:
        markdown_text: Full document content in markdown form.
        token: Access token; must equal REQUIRED_TOKEN.

    Returns:
        A JSON string: {"tables": [...]} on success, or
        {"error": "...", "tables": []} on auth/input/LLM failure.
    """
    if token != REQUIRED_TOKEN:
        return json.dumps({"error": "Invalid Authentication Token", "tables": []})

    if not markdown_text:
        return json.dumps({"error": "No markdown content provided", "tables": []})

    try:
        print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")

        # 1. Build the PageIndex Tree (best-effort: extraction still proceeds
        #    on a raw excerpt below if the build fails).
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            print("[PageIndex] Tree index built successfully for table extraction.")
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}, using fallback.")

        # 2. Initialize the LLM client: NVIDIA first, Mistral as fallback.
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
            except Exception as e2:
                return json.dumps({"error": f"LLM client error: {str(e2)}", "tables": []})

        # 3. Search for table-rich sections
        table_query = """
        Find all tables in the document including: Well Headers, Formation Tops, Casing Details, 
        Drilling Data, Directional Surveys, Core Analysis, Cementing Records, BHA records, 
        Cuttings Descriptions, and any other tabular data.
        Extract ALL rows and columns from each table found.
        """

        context = ""
        try:
            if hasattr(tree, 'reasoning_search'):
                context = tree.reasoning_search(query=table_query, llm_client=client, model=model)
            else:
                # Fallback: use document directly
                context = markdown_text[:15000]  # First 15k chars
        except Exception as e:
            print(f"[PageIndex] Tree search error: {e}, using fallback.")
            context = markdown_text[:15000]

        # Guard against empty or near-empty search results.
        if not context or len(context) < 100:
            context = markdown_text[:15000]

        # 4. Generate structured JSON tables
        extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.

CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
   - Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
   - Lithology descriptions with depths
   - Drilling events with dates/depths
   - Equipment lists in bullet points
   - Any sequential data that can be tabulated

**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates

**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}

EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
   - "title": A descriptive title for the table
   - "headers": Array of column names
   - "rows": Array of row objects - MUST INCLUDE ALL ROWS
   - "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!

Return VALID JSON ONLY in this exact format:

{
    "tables": [
        {
            "title": "Well Header Information",
            "headers": ["Well Name", "API Number", "Operator", "Location"],
            "rows": [
                {"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
            ],
            "page_number": 1
        }
    ]
}

VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables

Return ONLY the JSON, no markdown, no explanations, no code blocks."""

        messages = [
            {"role": "system", "content": extraction_prompt},
            {"role": "user", "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."}
        ]

        print("[PageIndex] Sending table extraction request to LLM...")

        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            max_tokens=16384,
            temperature=0
        )

        # message.content may be None (e.g. refusal / empty completion);
        # the original crashed on len(None) / None.strip() in that case.
        response_text = (response.choices[0].message.content or "").strip()
        print(f"[PageIndex] LLM response received: {len(response_text)} chars")

        # Parse JSON from response - handles raw JSON, fenced code blocks,
        # and JSON embedded in surrounding prose.
        data = _parse_tables_json(response_text)

        if data and "tables" in data:
            tables = data["tables"]
            # Ensure each table carries the fields downstream consumers expect.
            for table in tables:
                table.setdefault("page_number", 1)
                table.setdefault("source", "PageIndex")
            print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
            return json.dumps({"tables": tables})

        # If no valid JSON found, return empty
        print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
        return json.dumps({"tables": []})

    except Exception as e:
        print(f"[PageIndex] Table extraction error: {e}")
        return json.dumps({"error": str(e), "tables": []})


def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """
    Process document markdown and answer user query using PageIndex RAG.

    Yields streaming updates for real-time feedback: status lines wrapped in
    "<<<STATUS: ...>>>" markers, followed by the incrementally-growing final
    answer.

    Args:
        markdown_text: Full document content in markdown form.
        user_query: The question to answer from the document.
        token: Access token; must equal REQUIRED_TOKEN.
        chat_history_json: Optional JSON-encoded list of prior
            {"role": ..., "content": ...} messages to preserve conversation
            context.

    Yields:
        Progressive status/answer strings suitable for a streaming UI.
    """
    start_time = time.time()

    # Token validation
    # NOTE(review): these three error markers end with ">>>>>" while every
    # other status ends with ">>>" — confirm which terminator downstream
    # parsers expect before normalizing.
    if token != REQUIRED_TOKEN:
        yield "<<<STATUS: Error: Invalid Authentication Token.>>>>>"
        return

    if not markdown_text:
        yield "<<<STATUS: Error: Please provide document markdown text.>>>>>"
        return
    if not user_query:
        yield "<<<STATUS: Error: Please provide a query.>>>>>"
        return

    try:
        # History parsing (best-effort: malformed history is ignored, not fatal)
        chat_history = []
        if chat_history_json:
            try:
                chat_history = json.loads(chat_history_json)
            except Exception as e:
                print(f"[PageIndex] Warning: Could not parse chat history: {e}")

        reasoning_log = ""
        yield "<<<STATUS: Initializing PageIndex RAG Engine...>>>"

        # 1. Build the PageIndex Tree locally in the Space
        reasoning_log += "<<<STATUS: Building semantic tree index from markdown...>>>\n"
        yield reasoning_log

        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += f"<<<STATUS: Tree index built successfully.>>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += f"<<<STATUS: Warning: Tree build had issues, using fallback.>>>\n"
            yield reasoning_log

        # 2. Initialize the LLM client (NVIDIA first, Mistral as fallback)
        reasoning_log += "<<<STATUS: Initializing LLM client...>>>\n"
        yield reasoning_log

        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
            reasoning_log += f"<<<STATUS: Using NVIDIA model: {model}>>>\n"
        except Exception as e:
            print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
            try:
                client = get_llm_client(provider="mistral")
                model = get_model_name(provider="mistral")
                reasoning_log += f"<<<STATUS: Using Mistral model: {model} (NVIDIA fallback)>>>\n"
            except Exception as e2:
                yield f"<<<STATUS: Error: Could not initialize any LLM client. {str(e2)}>>>"
                return

        yield reasoning_log

        # 3. Perform Reasoning Search (Streamed)
        reasoning_log += "<<<STATUS: Performing semantic tree search for relevant sections...>>>\n"
        yield reasoning_log

        context = ""
        search_success = False

        # Use stream method if available; status updates are forwarded to the
        # caller, the final non-status payload becomes the context.
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(user_query=user_query, llm_client=client, model=model):
                    if update.startswith("<<<STATUS:"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    elif update.startswith("Error:"):
                        reasoning_log += f"<<<STATUS: Search warning: {update}>>>\n"
                        yield reasoning_log
                    else:
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += f"<<<STATUS: Warning: Streaming search failed, trying standard search...>>>\n"
                yield reasoning_log

        # Fallback to standard search if streaming failed or not available
        if not search_success:
            try:
                reasoning_log += "<<<STATUS: Using standard reasoning search...>>>\n"
                yield reasoning_log
                context = tree.reasoning_search(query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Use full document as context as last resort
                context = markdown_text[:8000]  # First 8000 chars
                reasoning_log += f"<<<STATUS: Warning: Using document excerpt as context.>>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            # Last resort: include first and last part of document.
            # (The original assigned a placeholder string here that was
            # immediately overwritten — dead code, removed.)
            context = markdown_text[:4000] + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n" + markdown_text[-4000:]

        # 4. Final Answer Generation
        reasoning_log += "<<<STATUS: Generating final answer with retrieved context...>>>\n"
        yield reasoning_log

        # Construct messages with history
        messages = [
            {"role": "system", "content": """You are a Senior Petroleum Engineer assistant.
Your goal is to extract precise technical data from the provided document context.

**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
  "type": "line" | "bar" | "area" | "scatter",
  "title": "Title",
  "xAxis": "x_label",
  "yAxis": "y_label",
  "data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""}
        ]

        # Add history
        for msg in chat_history:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            messages.append({"role": role, "content": content})

        messages.append({
            "role": "user", 
            "content": f"Context from document:\n{context}\n\nUser Query: {user_query}\n\nIf the query requests tabular data, provide a complete Markdown Table with all rows."
        })

        # Generate streaming response
        try:
            response_stream = client.chat.completions.create(
                model=model, 
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )

            full_response_text = ""
            for chunk in response_stream:
                # Guard: some providers emit keep-alive chunks with an empty
                # choices list; the original raised IndexError on those.
                if chunk.choices and chunk.choices[0].delta.content:
                    delta = chunk.choices[0].delta.content
                    full_response_text += delta
                    # Yield reasoning log + current response
                    yield reasoning_log + "\n" + "="*50 + "\nFINAL ANSWER:\n" + "="*50 + "\n" + full_response_text

            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")

        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"

    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<<STATUS: {error_msg}>>>"

# Gradio UI setup
# Two tabs over the same markdown-input pattern:
#   - "Chat / Query": streaming Q&A via process_docling_and_chat
#   - "Table Extraction": bulk JSON table dump via extract_tables_from_markdown
# Both are exposed as named API endpoints for programmatic callers.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")
    
    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here", 
                    lines=15, 
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?", 
                    placeholder="e.g., Extract all formation tops tables with depths"
                )
                # NOTE(review): pre-filling the password field with the real
                # default token defeats the auth check for anyone who loads
                # the UI — consider removing `value=`.
                token_input = gr.Textbox(
                    label="API Token", 
                    placeholder="Enter access token", 
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
                # Hidden carrier for conversation history (JSON list of
                # {"role", "content"} dicts), populated by API callers.
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
                output = gr.Textbox(label="Result", lines=15, interactive=False)
        
        btn.click(
            fn=process_docling_and_chat, 
            inputs=[input_md, query, token_input, history_json], 
            outputs=output, 
            api_name="process_docling_and_chat"
        )
    
    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here", 
                    lines=15, 
                    placeholder="# Document Title\n\n## Section 1\nContent..."
                )
            with gr.Column(scale=1):
                # NOTE(review): same pre-filled secret as the Chat tab.
                table_token_input = gr.Textbox(
                    label="API Token", 
                    placeholder="Enter access token", 
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh"
                )
                table_btn = gr.Button("Extract All Tables", variant="primary")
                table_output = gr.Textbox(label="Extracted Tables (JSON)", lines=15, interactive=False)
        
        table_btn.click(
            fn=extract_tables_from_markdown, 
            inputs=[table_input_md, table_token_input], 
            outputs=table_output, 
            api_name="extract_tables"
        )

if __name__ == "__main__":
    # Enable queue for concurrency
    # NOTE(review): server_name="0.0.0.0" binds all interfaces — intended for
    # containerized deployment (e.g. a Space); confirm for other environments.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)