# Source: Hugging Face Space "build-small-hackathon/Structured-Data-Rescuer"
# Space status: Running
# File size: 14,603 bytes

import gradio as gr
import json
import os
import csv
import tempfile
from huggingface_hub import InferenceClient

# Hugging Face repo ID of the chat model used for extraction; edit to switch models
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Access token read from the HF_TOKEN environment variable (None when it is unset)
hf_token = os.getenv("HF_TOKEN")

# Shared inference client bound to MODEL_ID and authenticated with hf_token
client = InferenceClient(token=hf_token, model=MODEL_ID)

# -------------------------
# Custom CSS Styling
# -------------------------
# Stylesheet passed to gr.Blocks(css=...) further down. Components opt in to a
# rule by listing the class name in their `elem_classes`:
#   .hero-container -> gradient header block at the top of the page
#   .primary-btn    -> the "Extract Structured Data" button
#   .secondary-btn  -> the "Build Export File" button
#   .feedback-card  -> not referenced by any component visible in this file
custom_css = """

.hero-container {

    background: linear-gradient(135deg, #6366f1 0%, #14b8a6 100%);

    padding: 2.5rem;

    border-radius: 20px;

    color: white;

    margin-bottom: 2rem;

    box-shadow: 0 10px 25px -5px rgba(99, 102, 241, 0.2);

}

.hero-container h1 {

    color: white !important;

    font-size: 2.5rem !important;

    font-weight: 800 !important;

    margin-bottom: 0.5rem;

    text-shadow: 0 2px 4px rgba(0,0,0,0.1);

}

.hero-container p {

    color: rgba(255, 255, 255, 0.9) !important;

    font-size: 1.1rem !important;

}

.primary-btn {

    background: linear-gradient(90deg, #6366f1 0%, #14b8a6 100%) !important;

    border: none !important;

    color: white !important;

    font-weight: 600 !important;

    border-radius: 10px !important;

    transition: all 0.3s ease !important;

    padding: 12px 24px !important;

}

.primary-btn:hover {

    transform: translateY(-2px);

    box-shadow: 0 8px 20px -5px rgba(99, 102, 241, 0.4);

}

.secondary-btn {

    border-radius: 10px !important;

    font-weight: 600 !important;

}

.feedback-card {

    border-left: 4px solid #6366f1;

    background-color: rgba(99, 102, 241, 0.05);

}

"""

# -------------------------
# Helper & Extraction Logic
# -------------------------
def generate_kpi_html(structured_data):
    """Build the HTML fragment for the KPI summary cards.

    Args:
        structured_data: Parsed extraction result. A dict produces one card
            for each of its first four items; a list produces a single
            "Total Records Found" card. Empty values, dicts that carry an
            "error" key, and any other type produce the dashed placeholder.

    Returns:
        str: HTML markup. Keys and values are HTML-escaped because they
        originate from model output over user-supplied text.
    """
    # Function-scope import so this block does not rely on the file header.
    from html import escape

    # Only a dict can be an error state; checking `"error" in x` on arbitrary
    # values would raise for numbers and substring-match on plain strings.
    is_error = isinstance(structured_data, dict) and "error" in structured_data
    if not structured_data or is_error or not isinstance(structured_data, (dict, list)):
        return """

        <div style='display: flex; justify-content: center; align-items: center; height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>

            Await extraction to generate KPI metrics...

        </div>

        """

    cards = []
    if isinstance(structured_data, dict):
        # Show at most four attributes so the card row stays compact.
        for key, val in list(structured_data.items())[:4]:
            # Turn snake_case / kebab-case keys into a readable label.
            display_key = str(key).replace("_", " ").replace("-", " ").title()

            if isinstance(val, list):
                display_val = ", ".join(map(str, val))
            else:
                display_val = str(val)

            # Truncate BEFORE escaping so an HTML entity is never cut in half.
            if len(display_val) > 40:
                display_val = display_val[:37] + "..."

            # Accent colour is chosen from keywords found in the label.
            key_lower = display_key.lower()
            accent_color = "#6366f1"  # default Indigo
            if any(x in key_lower for x in ["price", "total", "amount", "cost", "revenue", "budget"]):
                accent_color = "#10b981"  # Emerald for cash/costs
            elif any(x in key_lower for x in ["date", "deadline", "due", "time"]):
                accent_color = "#f59e0b"  # Amber for dates/reminders
            elif any(x in key_lower for x in ["status", "priority", "importance"]):
                accent_color = "#ef4444"  # Crimson for status/alerts

            safe_key = escape(display_key)
            safe_val = escape(display_val)

            cards.append(f"""

            <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid {accent_color}; min-width: 140px; flex: 1;'>

                <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>{safe_key}</div>

                <div style='font-size: 1.05rem; color: var(--body-text-color, #111827); font-weight: 800; word-break: break-word;'>{safe_val}</div>

            </div>

            """)
    else:
        # List result: a single card reporting how many records were found.
        cards.append(f"""

        <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid #6366f1; min-width: 140px; flex: 1;'>

            <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>Total Records Found</div>

            <div style='font-size: 1.5rem; color: var(--body-text-color, #111827); font-weight: 800;'>{len(structured_data)}</div>

        </div>

        """)

    cards_html = "".join(cards)
    return f"""

    <div style='display: flex; flex-wrap: wrap; gap: 0.75rem; margin-bottom: 1rem; width: 100%;'>

        {cards_html}

    </div>

    """

def _strip_code_fence(text):
    """Remove a surrounding Markdown ``` fence from *text*, if present."""
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) < 2:
        return text
    # First line is the opening fence (possibly "```json"); drop it.
    lines = lines[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _parse_model_json(text):
    """Parse model output as JSON, tolerating code fences and surrounding prose.

    Raises:
        json.JSONDecodeError: If no parseable JSON payload is found.
    """
    cleaned = _strip_code_fence(text)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback: retry on the span between the first opening and the last
        # closing bracket, for replies that wrap the payload in extra text.
        starts = [i for i in (cleaned.find("{"), cleaned.find("[")) if i != -1]
        end = max(cleaned.rfind("}"), cleaned.rfind("]"))
        if not starts or end <= min(starts):
            raise
        return json.loads(cleaned[min(starts):end + 1])


def _rows_for_table(structured_data):
    """Flatten a dict or list result into [label, value] rows for the table view."""
    if isinstance(structured_data, dict):
        return [
            [k, ", ".join(map(str, v)) if isinstance(v, list) else str(v)]
            for k, v in structured_data.items()
        ]
    return [[f"Item {idx + 1}", str(item)] for idx, item in enumerate(structured_data)]


def extract_data(raw_text, fields_to_extract):
    """Ask the model to extract the requested fields from unstructured text.

    Args:
        raw_text: The unstructured text to analyse.
        fields_to_extract: Free-form description of the fields to pull out.

    Returns:
        tuple: ``(json_payload, table_rows, kpi_html)``. On success
        ``json_payload`` is the parsed dict or list; on any failure it is a
        dict containing an ``"error"`` key, ``table_rows`` holds a single
        error row, and ``kpi_html`` is the placeholder markup.
    """
    if not hf_token:
        err_state = {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."}
        return err_state, [["Error", "HF_TOKEN missing"]], generate_kpi_html(err_state)

    # `or ""` guards against None being passed for an empty input.
    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
        err_state = {"error": "Please provide both raw text and fields to extract."}
        return err_state, [["Error", "Incomplete inputs"]], generate_kpi_html(err_state)

    # Construct the system instruction
    system_prompt = (
        "You are an expert data extraction assistant. Your job is to extract specific "
        "information from messy, unstructured text and output it as clean, valid JSON.\n"
        "Rules:\n"
        "1. Only extract the fields requested.\n"
        "2. If a field is not found in the text, return 'null' for that field.\n"
        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
    )

    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Step 1: call the model. Kept in its own try so that `output_text` is
    # always bound by the time the JSON error path below needs it.
    try:
        response = client.chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.1,
        )
        output_text = (response.choices[0].message.content or "").strip()
    except Exception as e:  # UI boundary: surface any API failure as a message
        error_msg = str(e)
        if "model_not_found" in error_msg or "does not exist" in error_msg:
            err_dict = {
                "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
                "troubleshooting": [
                    "1. Check your Hugging Face repo for typos (case-sensitive).",
                    "2. Verify HF_TOKEN secret read permissions.",
                    "3. GGUF or LoRA adapter models are not directly supported by the Serverless API."
                ]
            }
            return err_dict, [["Connection Error", "Model Not Found"]], generate_kpi_html(err_dict)
        err_state = {"error": error_msg}
        return err_state, [["Error", error_msg]], generate_kpi_html(err_state)

    # Step 2: parse the reply.
    try:
        structured_data = _parse_model_json(output_text)
    except json.JSONDecodeError:
        error_dict = {
            "error": "The model failed to return valid JSON. It returned this instead:",
            "raw_output": output_text
        }
        return error_dict, [["Error", "Invalid JSON parsed"]], generate_kpi_html(error_dict)

    # A bare scalar (number, string, null) is valid JSON but cannot be shown
    # as a table or exported, so report it instead of returning empty views.
    if not isinstance(structured_data, (dict, list)):
        error_dict = {
            "error": "The model returned JSON that is not an object or array. It returned this instead:",
            "raw_output": output_text
        }
        return error_dict, [["Error", "Invalid JSON parsed"]], generate_kpi_html(error_dict)

    # Step 3: derive the table rows and KPI cards from the parsed payload.
    return structured_data, _rows_for_table(structured_data), generate_kpi_html(structured_data)

def generate_csv(json_data):
    """Write the extracted JSON data to a CSV file and return its path.

    Args:
        json_data: A dict (written as one row) or a list of dicts (one row
            each). Non-dict list items are skipped. Nested list/dict values
            are stored as their ``str()`` representation.

    Returns:
        str | None: Path to ``extracted_data.csv`` inside a fresh temporary
        directory, or ``None`` when the input is empty, is an error dict,
        contains no dict rows, or the file cannot be written.
    """
    if isinstance(json_data, dict):
        if "error" in json_data:
            return None
        rows = [json_data]
    elif isinstance(json_data, list):
        rows = [item for item in json_data if isinstance(item, dict)]
    else:
        return None

    # Collect column names in first-seen order. A dict (unlike a set) keeps
    # insertion order, so the column layout is the same on every run.
    ordered = {}
    for row in rows:
        for key in row:
            ordered.setdefault(key, None)
    headers = list(ordered)

    # Decide there is something to write BEFORE touching the filesystem so
    # that no empty temp directory/file is left behind.
    if not headers:
        return None

    try:
        csv_path = os.path.join(tempfile.mkdtemp(), "extracted_data.csv")
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            for row in rows:
                writer.writerow(
                    {k: (str(v) if isinstance(v, (list, dict)) else v) for k, v in row.items()}
                )
    except (OSError, ValueError, csv.Error):
        # Best-effort export: a failed write yields "no file" rather than a crash.
        return None
    return csv_path

# -------------------------
# Build the Gradio UI
# -------------------------
# Page layout: hero header, a two-column row (inputs on the left, KPI cards,
# result tabs and export controls on the right), an examples panel, and the
# click handlers that wire the buttons to extract_data / generate_csv.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

    # Styled Header Block.
    # gr.Column is a layout container, so the Markdown below is rendered as
    # its child and inherits the .hero-container styling. (gr.HTML is a
    # display component and is not a portable way to nest other components.)
    with gr.Column(elem_classes="hero-container"):
        gr.Markdown(
            f"""

            # 🛟 The Data Rescuer

            Turn messy logs, disorganized lists, automated transcripts, and raw OCR scripts into highly structured business-ready assets — powered by `{MODEL_ID}`.

            """
        )

    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=1):
            raw_input = gr.Textbox(
                label="1. Paste Unstructured Text",
                placeholder="Paste your messy meeting notes, emails, or raw text here...",
                lines=12
            )

            schema_input = gr.Textbox(
                label="2. What fields do you want to extract?",
                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
                lines=3
            )

            extract_btn = gr.Button("🚀 Extract Structured Data", variant="primary", elem_classes="primary-btn")

        # Right Column: Multi-view Output Panels
        with gr.Column(scale=1):
            # KPI summary cards. The initial value is the same placeholder
            # that generate_kpi_html returns for "no data", so the markup is
            # defined in one place only.
            kpi_output = gr.HTML(value=generate_kpi_html(None))

            with gr.Tabs():
                with gr.TabItem("📊 Structured Table"):
                    table_output = gr.Dataframe(
                        headers=["Field Name", "Extracted Value"],
                        datatype=["str", "str"],
                        interactive=False,
                        wrap=True
                    )
                with gr.TabItem("🔍 Raw JSON Tree"):
                    json_output = gr.JSON(label="JSON Object")

            # Action controls below outputs
            with gr.Row():
                export_btn = gr.Button("💾 Build Export File", variant="secondary", elem_classes="secondary-btn")
                csv_output = gr.File(label="Ready for Download", interactive=False)

    # -------------------------
    # Examples Panel
    # -------------------------
    # Each example is [raw text, fields to extract], matching `inputs` below.
    gr.Markdown("### Try it out with these examples:")
    gr.Examples(
        examples=[
            [
                "Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
                "Task Owner, Task Description, Deadline, Client Name"
            ],
            [
                "Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
                "Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount"
            ]
        ],
        inputs=[raw_input, schema_input],
        label="Click an example to populate the inputs"
    )

    # -------------------------
    # Event Connections
    # -------------------------
    # 1. Extraction: the three outputs line up with extract_data's return
    #    tuple (JSON payload, table rows, KPI HTML).
    extract_btn.click(
        fn=extract_data,
        inputs=[raw_input, schema_input],
        outputs=[json_output, table_output, kpi_output]
    )

    # 2. CSV export: feeds the current JSON view into generate_csv, which
    #    returns a file path (or None when there is nothing to export).
    export_btn.click(
        fn=generate_csv,
        inputs=[json_output],
        outputs=[csv_output]
    )

# Launch the app: start the Gradio server only when this file is executed as
# a script, so importing the module does not call launch().
if __name__ == "__main__":
    demo.launch()