Spaces:

build-small-hackathon
/

Structured-Data-Rescuer

Running

App Files Community

TensorVizion committed on 24 days ago

Commit

080d73e

verified ·

1 Parent(s): f79c1f2

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -11

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import gradio as gr
 import json
-import re
 import os
 import csv
 import tempfile
 from huggingface_hub import InferenceClient
 # Replace this with your exact model repo ID
 MODEL_ID = "tensorvizion/O-wen-4.6"
 # Securely load the Hugging Face token from Space secrets
@@ -22,7 +22,7 @@ def extract_data(raw_text, fields_to_extract):
     if not raw_text.strip() or not fields_to_extract.strip():
         return {"error": "Please provide both raw text and fields to extract."}
-    # Construct the system instruction for O-wen 4.6
     system_prompt = (
         "You are an expert data extraction assistant. Your job is to extract specific "
         "information from messy, unstructured text and output it as clean, valid JSON.\n"
@@ -40,7 +40,7 @@ def extract_data(raw_text, fields_to_extract):
     ]
     try:
-        # Call O-wen 4.6 via the chat completion API
         response = client.chat_completion(
             messages=messages,
             max_tokens=1024,
@@ -49,13 +49,21 @@ def extract_data(raw_text, fields_to_extract):
         output_text = response.choices[0].message.content.strip()
-        # Fallback: Safely strip markdown code blocks without using complex regex
-        if output_text.startswith("```"):
-            output_text = re.sub(r"^```(?:json)?\n?", "", output_text)
-            output_text = re.sub(r"\n?```$", "", output_text)
-        # Parse the text into an actual JSON dictionary for the Gradio UI
-        structured_data = json.loads(output_text.strip())
         return structured_data
     except json.JSONDecodeError:
@@ -64,7 +72,20 @@ def extract_data(raw_text, fields_to_extract):
             "raw_output": output_text
         }
     except Exception as e:
-        return {"error": str(e)}
 def generate_csv(json_data):
     """Converts the JSON output into a downloadable CSV file."""
@@ -73,6 +94,8 @@ def generate_csv(json_data):
     # Normalize data into a list of dictionaries for the CSV writer
     if isinstance(json_data, dict):
         data_list = [json_data]
     elif isinstance(json_data, list):
         data_list = json_data
@@ -92,6 +115,9 @@ def generate_csv(json_data):
                     headers.update(item.keys())
             headers = list(headers)
             writer = csv.DictWriter(f, fieldnames=headers)
             writer.writeheader()

 import gradio as gr
 import json
 import os
 import csv
 import tempfile
 from huggingface_hub import InferenceClient
 # Replace this with your exact model repo ID
+# Note: Ensure exact casing. If the model is a GGUF, we will need to change how this runs.
 MODEL_ID = "tensorvizion/O-wen-4.6"
 # Securely load the Hugging Face token from Space secrets
     if not raw_text.strip() or not fields_to_extract.strip():
         return {"error": "Please provide both raw text and fields to extract."}
+    # Construct the system instruction
     system_prompt = (
         "You are an expert data extraction assistant. Your job is to extract specific "
         "information from messy, unstructured text and output it as clean, valid JSON.\n"
     ]
     try:
+        # Call the model via the chat completion API
         response = client.chat_completion(
             messages=messages,
             max_tokens=1024,
         output_text = response.choices[0].message.content.strip()
+        # Fallback: Safely strip markdown code blocks without regular expressions
+        cleaned_text = output_text
+        if cleaned_text.startswith("```"):
+            lines = cleaned_text.splitlines()
+            if len(lines) >= 2:
+                # Discard the opening line (e.g., ```json or ```)
+                if lines[0].startswith("```"):
+                    lines = lines[1:]
+                # Discard the closing line (e.g., ```)
+                if lines and lines[-1].strip() == "```":
+                    lines = lines[:-1]
+                cleaned_text = "\n".join(lines).strip()
+        # Parse the text into an actual JSON dictionary
+        structured_data = json.loads(cleaned_text)
         return structured_data
     except json.JSONDecodeError:
             "raw_output": output_text
         }
     except Exception as e:
+        error_msg = str(e)
+        # Enhanced error handling for model connectivity issues
+        if "model_not_found" in error_msg or "does not exist" in error_msg:
+            return {
+                "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
+                "troubleshooting": [
+                    "1. Check your Hugging Face repo for typos in the MODEL_ID string (it is case-sensitive).",
+                    "2. If the model is Private, ensure your HF_TOKEN has read access.",
+                    "3. If your model is a GGUF or LoRA adapter, the Serverless API does not support it directly.",
+                    "Test by temporarily changing MODEL_ID to 'Qwen/Qwen2.5-7B-Instruct' to verify the app works."
+                ],
+                "raw_error": error_msg
+            }
+        return {"error": error_msg}
 def generate_csv(json_data):
     """Converts the JSON output into a downloadable CSV file."""
     # Normalize data into a list of dictionaries for the CSV writer
     if isinstance(json_data, dict):
+        if "error" in json_data:
+            return None
         data_list = [json_data]
     elif isinstance(json_data, list):
         data_list = json_data
                     headers.update(item.keys())
             headers = list(headers)
+            if not headers:
+                return None
             writer = csv.DictWriter(f, fieldnames=headers)
             writer.writeheader()