Update app.py
Browse files
app.py
CHANGED
|
@@ -1,15 +1,24 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import json
|
| 3 |
import re
|
|
|
|
|
|
|
|
|
|
| 4 |
from huggingface_hub import InferenceClient
|
| 5 |
|
| 6 |
# Replace this with your exact model repo ID
|
| 7 |
MODEL_ID = "tensorvizion/O-wen-4.6"
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def extract_data(raw_text, fields_to_extract):
|
|
|
|
|
|
|
|
|
|
| 13 |
if not raw_text.strip() or not fields_to_extract.strip():
|
| 14 |
return {"error": "Please provide both raw text and fields to extract."}
|
| 15 |
|
|
@@ -41,10 +50,8 @@ def extract_data(raw_text, fields_to_extract):
|
|
| 41 |
output_text = response.choices[0].message.content.strip()
|
| 42 |
|
| 43 |
# Fallback: Safely strip markdown code blocks without using complex regex
|
| 44 |
-
# that might break code editors during copy-pasting
|
| 45 |
if output_text.startswith("```"):
|
| 46 |
-
|
| 47 |
-
# Remove the ending ```
|
| 48 |
output_text = re.sub(r"\n?```$", "", output_text)
|
| 49 |
|
| 50 |
# Parse the text into an actual JSON dictionary for the Gradio UI
|
|
@@ -59,12 +66,48 @@ def extract_data(raw_text, fields_to_extract):
|
|
| 59 |
except Exception as e:
|
| 60 |
return {"error": str(e)}
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 66 |
gr.Markdown("# 🛟 The Data Rescuer")
|
| 67 |
-
gr.Markdown(f"**Powered by `{MODEL_ID}`** | Turn messy transcripts, notes, and OCR text into clean JSON data.")
|
| 68 |
|
| 69 |
with gr.Row():
|
| 70 |
# Left Column: Inputs
|
|
@@ -86,13 +129,40 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 86 |
# Right Column: Output
|
| 87 |
with gr.Column():
|
| 88 |
json_output = gr.JSON(label="Structured Output")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
# Connect
|
| 91 |
extract_btn.click(
|
| 92 |
fn=extract_data,
|
| 93 |
inputs=[raw_input, schema_input],
|
| 94 |
outputs=json_output
|
| 95 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Launch the app
|
| 98 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
+
import os
|
| 5 |
+
import csv
|
| 6 |
+
import tempfile
|
| 7 |
from huggingface_hub import InferenceClient
|
| 8 |
|
| 9 |
# Replace this with your exact model repo ID
|
| 10 |
MODEL_ID = "tensorvizion/O-wen-4.6"
|
| 11 |
|
| 12 |
+
# Securely load the Hugging Face token from Space secrets
|
| 13 |
+
hf_token = os.environ.get("HF_TOKEN")
|
| 14 |
+
|
| 15 |
+
# Initialize the HF inference client with the token
|
| 16 |
+
client = InferenceClient(model=MODEL_ID, token=hf_token)
|
| 17 |
|
| 18 |
def extract_data(raw_text, fields_to_extract):
|
| 19 |
+
if not hf_token:
|
| 20 |
+
return {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."}
|
| 21 |
+
|
| 22 |
if not raw_text.strip() or not fields_to_extract.strip():
|
| 23 |
return {"error": "Please provide both raw text and fields to extract."}
|
| 24 |
|
|
|
|
| 50 |
output_text = response.choices[0].message.content.strip()
|
| 51 |
|
| 52 |
# Fallback: strip a surrounding Markdown code fence (``` or ```json) from the model output
|
|
|
|
| 53 |
if output_text.startswith("```"):
|
| 54 |
+
output_text = re.sub(r"^```(?:json)?\n?", "", output_text)
|
|
|
|
| 55 |
output_text = re.sub(r"\n?```$", "", output_text)
|
| 56 |
|
| 57 |
# Parse the text into an actual JSON dictionary for the Gradio UI
|
|
|
|
| 66 |
except Exception as e:
|
| 67 |
return {"error": str(e)}
|
| 68 |
|
| 69 |
+
def generate_csv(json_data):
    """Convert the extracted JSON payload into a downloadable CSV file.

    Args:
        json_data: The value held by the JSON output component: either a
            single record (dict) or a list of records. List items that are
            not dicts are skipped.

    Returns:
        The path of a newly written ``extracted_data.csv`` inside a fresh
        temporary directory, or ``None`` when there is nothing to export
        (empty input, an ``{"error": ...}`` payload, an unsupported type, or
        no dict records with at least one key) or the file cannot be written.
    """
    if not json_data:
        return None

    # Normalize the payload into a list of dict records.
    if isinstance(json_data, dict):
        # A dict carrying an "error" key is a failure report, not data.
        if "error" in json_data:
            return None
        records = [json_data]
    elif isinstance(json_data, list):
        records = [item for item in json_data if isinstance(item, dict)]
    else:
        return None

    # Collect column headers in first-seen order. A set would make the column
    # order arbitrary and different from one run to the next.
    headers = list(dict.fromkeys(key for record in records for key in record))
    if not headers:
        return None

    def _csv_cell(value):
        # Nested structures are written as JSON text so the cell stays
        # machine-readable (str() would emit Python repr with single quotes).
        if isinstance(value, (list, dict)):
            return json.dumps(value, ensure_ascii=False)
        return value

    try:
        # mkdtemp gives every export its own private directory, so the file
        # can keep a friendly fixed name without clashing with other exports.
        temp_dir = tempfile.mkdtemp()
        csv_path = os.path.join(temp_dir, "extracted_data.csv")
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            # Keys missing from a record are filled with DictWriter's default
            # restval (an empty string).
            writer.writerows(
                {key: _csv_cell(value) for key, value in record.items()}
                for record in records
            )
    except (OSError, csv.Error, ValueError):
        # Best-effort export: the UI simply shows no file on failure.
        return None

    return csv_path
|
| 107 |
+
|
| 108 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 109 |
gr.Markdown("# 🛟 The Data Rescuer")
|
| 110 |
+
gr.Markdown(f"**Powered by `{MODEL_ID}`** | Turn messy transcripts, notes, and OCR text into clean JSON and CSV data.")
|
| 111 |
|
| 112 |
with gr.Row():
|
| 113 |
# Left Column: Inputs
|
|
|
|
| 129 |
# Right Column: Output
|
| 130 |
with gr.Column():
|
| 131 |
json_output = gr.JSON(label="Structured Output")
|
| 132 |
+
|
| 133 |
+
# CSV Export UI
|
| 134 |
+
export_btn = gr.Button("⬇️ Download as CSV", variant="secondary")
|
| 135 |
+
csv_output = gr.File(label="Your CSV File", interactive=False)
|
| 136 |
+
|
| 137 |
+
gr.Markdown("### Try it out with these examples:")
|
| 138 |
+
gr.Examples(
|
| 139 |
+
examples=[
|
| 140 |
+
[
|
| 141 |
+
"Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
|
| 142 |
+
"Task Owner, Task Description, Deadline, Client Name"
|
| 143 |
+
],
|
| 144 |
+
[
|
| 145 |
+
"Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
|
| 146 |
+
"Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount"
|
| 147 |
+
]
|
| 148 |
+
],
|
| 149 |
+
inputs=[raw_input, schema_input],
|
| 150 |
+
label="Click an example to populate the inputs"
|
| 151 |
+
)
|
| 152 |
|
| 153 |
+
# 1. Connect extraction button
|
| 154 |
extract_btn.click(
|
| 155 |
fn=extract_data,
|
| 156 |
inputs=[raw_input, schema_input],
|
| 157 |
outputs=json_output
|
| 158 |
)
|
| 159 |
+
|
| 160 |
+
# 2. Connect CSV export button
|
| 161 |
+
export_btn.click(
|
| 162 |
+
fn=generate_csv,
|
| 163 |
+
inputs=[json_output],
|
| 164 |
+
outputs=[csv_output]
|
| 165 |
+
)
|
| 166 |
|
| 167 |
# Launch the app
|
| 168 |
if __name__ == "__main__":
|