Spaces:

build-small-hackathon
/

Structured-Data-Rescuer

Running

App Files Files Community

TensorVizion committed 25 days ago

Commit

4b61207

verified ·

1 Parent(s): 3409101

Upload 2 files

Browse files

Files changed (2) hide show

app.py +101 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import json
+import re
+from huggingface_hub import InferenceClient
+# Hugging Face Hub repo ID of the model that performs the extraction.
+# Replace this with your exact model repo ID, e.g., "tensorvizion/O-wen-4.6".
+MODEL_ID = "tensorvizion/O-wen-4.6"
+# Module-level inference client bound to MODEL_ID; extract_data() sends its
+# chat-completion requests through it.
+# Note: In a private Space, you would pass token=os.environ["HF_TOKEN"]
+client = InferenceClient(model=MODEL_ID)
+def extract_data(raw_text, fields_to_extract):
+    if not raw_text.strip() or not fields_to_extract.strip():
+        return {"error": "Please provide both raw text and fields to extract."}
+    # Construct the system instruction for O-wen 4.6
+    system_prompt = (
+        "You are an expert data extraction assistant. Your job is to extract specific "
+        "information from messy, unstructured text and output it as clean, valid JSON.\n"
+        "Rules:\n"
+        "1. Only extract the fields requested.\n"
+        "2. If a field is not found in the text, return 'null' for that field.\n"
+        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
+    )
+    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt}
+    ]
+    try:
+        # Call O-wen 4.6 via the chat completion API
+        response = client.chat_completion(
+            messages=messages,
+            max_tokens=1024,
+            temperature=0.1, # Low temperature for more deterministic/factual data extraction
+        )
+        output_text = response.choices[0].message.content.strip()
+        # Fallback: Sometimes models still wrap JSON in markdown backticks (```json ...
+```)
+        # This regex strips the markdown so the json.loads() doesn't crash
+        json_match = re.search(r'```json\n(.*?)\n```', output_text, re.DOTALL)
+        if json_match:
+            output_text = json_match.group(1)
+        # Parse the text into an actual JSON dictionary for the Gradio UI
+        structured_data = json.loads(output_text)
+        return structured_data
+    except json.JSONDecodeError:
+        return {
+            "error": "The model failed to return valid JSON. It returned this instead:",
+            "raw_output": output_text
+        }
+    except Exception as e:
+        return {"error": str(e)}
+# -------------------------
+# Build the Gradio UI
+# -------------------------
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🛟 The Data Rescuer")
+    gr.Markdown(f"**Powered by `{MODEL_ID}`** | Turn messy transcripts, notes, and OCR text into clean JSON data.")
+    with gr.Row():
+        # Left Column: Inputs
+        with gr.Column():
+            raw_input = gr.Textbox(
+                label="1. Paste Unstructured Text",
+                placeholder="Paste your messy meeting notes, emails, or raw text here...",
+                lines=12
+            )
+            schema_input = gr.Textbox(
+                label="2. What fields do you want to extract?",
+                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
+                lines=3
+            )
+            extract_btn = gr.Button("Extract Structured Data", variant="primary")
+        # Right Column: Output
+        with gr.Column():
+            json_output = gr.JSON(label="Structured Output")
+    # Connect the button to the function
+    extract_btn.click(
+        fn=extract_data,
+        inputs=[raw_input, schema_input],
+        outputs=json_output
+    )
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio
2	+ huggingface_hub