import gradio as gr
import json
import os
import csv
import tempfile
from huggingface_hub import InferenceClient
# Hub repo ID of the chat model used for extraction; edit it to target a different model.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Access token taken from the HF_TOKEN environment variable (None when it is not set).
hf_token = os.getenv("HF_TOKEN")

# Shared inference client bound to the model and token above.
client = InferenceClient(token=hf_token, model=MODEL_ID)
# -------------------------
# Custom CSS Styling
# -------------------------
# Stylesheet string passed to gr.Blocks(css=...) when the UI is built below.
# It defines four classes: .hero-container (gradient header banner),
# .primary-btn (gradient button with a hover lift), .secondary-btn and
# .feedback-card. The first three are attached to components via elem_classes.
# NOTE(review): .feedback-card is not referenced by any component in the
# visible UI code — confirm whether it is still needed.
custom_css = """
.hero-container {
background: linear-gradient(135deg, #6366f1 0%, #14b8a6 100%);
padding: 2.5rem;
border-radius: 20px;
color: white;
margin-bottom: 2rem;
box-shadow: 0 10px 25px -5px rgba(99, 102, 241, 0.2);
}
.hero-container h1 {
color: white !important;
font-size: 2.5rem !important;
font-weight: 800 !important;
margin-bottom: 0.5rem;
text-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.hero-container p {
color: rgba(255, 255, 255, 0.9) !important;
font-size: 1.1rem !important;
}
.primary-btn {
background: linear-gradient(90deg, #6366f1 0%, #14b8a6 100%) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
border-radius: 10px !important;
transition: all 0.3s ease !important;
padding: 12px 24px !important;
}
.primary-btn:hover {
transform: translateY(-2px);
box-shadow: 0 8px 20px -5px rgba(99, 102, 241, 0.4);
}
.secondary-btn {
border-radius: 10px !important;
font-weight: 600 !important;
}
.feedback-card {
border-left: 4px solid #6366f1;
background-color: rgba(99, 102, 241, 0.05);
}
"""
# -------------------------
# Helper & Extraction Logic
# -------------------------
def generate_kpi_html(structured_data):
    """Render the extraction result as a strip of KPI cards (HTML snippet).

    Args:
        structured_data: Parsed extraction result. A dict yields one card for
            each of its first four items; a list yields a single
            "Total Records Found" card showing its length.

    Returns:
        An HTML string. A dashed placeholder box is returned when
        ``structured_data`` is falsy or is a dict carrying an ``"error"`` key.
        Any other type (e.g. a bare number or string) yields an empty card
        container instead of raising.
    """
    import html  # function-scope import so this block stays self-contained

    # Only dicts use the {"error": ...} convention; a membership test on a
    # list or scalar would either misfire or raise TypeError.
    is_error_state = isinstance(structured_data, dict) and "error" in structured_data
    if not structured_data or is_error_state:
        return """
        <div style='display: flex; justify-content: center; align-items: center; height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>
        Await extraction to generate KPI metrics...
        </div>
        """

    # (keywords looked up in the lower-cased label, accent colour); first match wins.
    accent_rules = (
        (("price", "total", "amount", "cost", "revenue", "budget"), "#10b981"),  # Emerald for cash/costs
        (("date", "deadline", "due", "time"), "#f59e0b"),  # Amber for dates/reminders
        (("status", "priority", "importance"), "#ef4444"),  # Crimson for status/alerts
    )
    default_accent = "#6366f1"  # Indigo

    cards = []
    if isinstance(structured_data, dict):
        # Only the first four attributes are shown as metrics.
        for key, val in list(structured_data.items())[:4]:
            display_key = str(key).replace("_", " ").replace("-", " ").title()

            if isinstance(val, list):
                display_val = ", ".join(map(str, val))
            else:
                display_val = str(val)
            # Truncate before escaping so an HTML entity is never cut in half.
            if len(display_val) > 40:
                display_val = display_val[:37] + "..."

            label = display_key.lower()
            accent_color = next(
                (color for words, color in accent_rules if any(w in label for w in words)),
                default_accent,
            )

            # Keys and values originate from model output over user-supplied
            # text, so they are escaped before being placed in markup.
            safe_key = html.escape(display_key)
            safe_val = html.escape(display_val)
            cards.append(f"""
            <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid {accent_color}; min-width: 140px; flex: 1;'>
            <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>{safe_key}</div>
            <div style='font-size: 1.05rem; color: var(--body-text-color, #111827); font-weight: 800; word-break: break-word;'>{safe_val}</div>
            </div>
            """)
    elif isinstance(structured_data, list):
        # Summary KPI for array data structures.
        cards.append(f"""
        <div style='background: var(--body-background-fill, #ffffff); padding: 1rem; border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); border: 1px solid var(--border-color-primary, #e5e7eb); border-left: 5px solid #6366f1; min-width: 140px; flex: 1;'>
        <div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; margin-bottom: 0.25rem;'>Total Records Found</div>
        <div style='font-size: 1.5rem; color: var(--body-text-color, #111827); font-weight: 800;'>{len(structured_data)}</div>
        </div>
        """)

    cards_html = "".join(cards)
    return f"""
    <div style='display: flex; flex-wrap: wrap; gap: 0.75rem; margin-bottom: 1rem; width: 100%;'>
    {cards_html}
    </div>
    """
def extract_data(raw_text, fields_to_extract):
    """Ask the chat model to pull the requested fields out of free text.

    Args:
        raw_text: Unstructured text to extract from.
        fields_to_extract: Free-form description of the wanted fields.

    Returns:
        A 3-tuple ``(json_value, table_rows, kpi_html)``, in the order the
        extract button wires its outputs (JSON view, table, KPI cards).
        Failures are reported in-band: ``json_value`` is a dict with an
        ``"error"`` key, ``table_rows`` holds a single explanatory row and
        ``kpi_html`` is the placeholder produced by ``generate_kpi_html``.
    """

    def failure(payload, row):
        # Uniform shape for every error return.
        return payload, [row], generate_kpi_html(payload)

    if not hf_token:
        return failure(
            {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."},
            ["Error", "HF_TOKEN missing"],
        )

    # `or ""` keeps a None input from raising before the try block below.
    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
        return failure(
            {"error": "Please provide both raw text and fields to extract."},
            ["Error", "Incomplete inputs"],
        )

    # Construct the system instruction
    system_prompt = (
        "You are an expert data extraction assistant. Your job is to extract specific "
        "information from messy, unstructured text and output it as clean, valid JSON.\n"
        "Rules:\n"
        "1. Only extract the fields requested.\n"
        "2. If a field is not found in the text, return 'null' for that field.\n"
        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
    )
    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    output_text = ""
    try:
        # Call the model via the chat completion API
        response = client.chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.1,
        )
        # Treat a missing message body as empty text so it is reported as
        # invalid JSON rather than as an AttributeError.
        output_text = (response.choices[0].message.content or "").strip()

        # Fallback: strip a surrounding markdown code fence without regular expressions
        cleaned_text = output_text
        if cleaned_text.startswith("```"):
            fence_lines = cleaned_text.splitlines()
            if len(fence_lines) >= 2:
                fence_lines = fence_lines[1:]  # opening fence line, e.g. ```json
                if fence_lines and fence_lines[-1].strip() == "```":
                    fence_lines = fence_lines[:-1]
                cleaned_text = "\n".join(fence_lines).strip()

        try:
            structured_data = json.loads(cleaned_text)
        except json.JSONDecodeError:
            # Retry on the outermost {...} / [...] span so JSON wrapped in
            # prose or in a one-line fence still parses. A second failure
            # propagates to the JSONDecodeError handler below.
            starts = [i for i in (cleaned_text.find("{"), cleaned_text.find("[")) if i != -1]
            end = max(cleaned_text.rfind("}"), cleaned_text.rfind("]"))
            if not starts or end < min(starts):
                raise
            structured_data = json.loads(cleaned_text[min(starts):end + 1])

        # Scalars (null, numbers, bare strings) are valid JSON but cannot be
        # shown as a table or as KPI cards, so they are reported explicitly.
        if not isinstance(structured_data, (dict, list)):
            return failure(
                {
                    "error": "The model returned a JSON value that is not an object or array.",
                    "raw_output": output_text,
                },
                ["Error", "Unexpected JSON type"],
            )

        # Convert the JSON structure to a 2D list for the table view
        if isinstance(structured_data, dict):
            table_data = [
                [k, ", ".join(map(str, v)) if isinstance(v, list) else str(v)]
                for k, v in structured_data.items()
            ]
        else:
            table_data = [
                [f"Item {position}", str(item)]
                for position, item in enumerate(structured_data, start=1)
            ]
        return structured_data, table_data, generate_kpi_html(structured_data)
    except json.JSONDecodeError:
        return failure(
            {
                "error": "The model failed to return valid JSON. It returned this instead:",
                "raw_output": output_text,
            },
            ["Error", "Invalid JSON parsed"],
        )
    except Exception as e:
        # UI boundary: any other failure (network, auth, API errors) is shown
        # to the user instead of crashing the event handler.
        error_msg = str(e)
        if "model_not_found" in error_msg or "does not exist" in error_msg:
            return failure(
                {
                    "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
                    "troubleshooting": [
                        "1. Check your Hugging Face repo for typos (case-sensitive).",
                        "2. Verify HF_TOKEN secret read permissions.",
                        "3. GGUF or LoRA adapter models are not directly supported by the Serverless API.",
                    ],
                },
                ["Connection Error", "Model Not Found"],
            )
        return failure({"error": error_msg}, ["Error", error_msg])
def generate_csv(json_data):
    """Write the extraction result to a temporary CSV file for download.

    Args:
        json_data: A dict (one record) or a list of dicts (one record each).
            List entries that are not dicts are skipped.

    Returns:
        Path of the generated ``extracted_data.csv`` inside a fresh temporary
        directory, or ``None`` when there is nothing to export (empty input,
        an ``{"error": ...}`` dict, no dict records, unsupported type) or the
        file could not be written.
    """
    if isinstance(json_data, dict):
        # Error states are dicts carrying an "error" key; never export those.
        if "error" in json_data:
            return None
        records = [json_data]
    elif isinstance(json_data, list):
        records = [item for item in json_data if isinstance(item, dict)]
    else:
        return None

    # dict.fromkeys keeps first-seen order, so the column order is stable
    # across runs (a set would reorder it under string-hash randomization).
    headers = list(dict.fromkeys(key for record in records for key in record))
    if not headers:
        # Decided before touching the filesystem so no empty file is left behind.
        return None

    try:
        temp_dir = tempfile.mkdtemp()
        csv_path = os.path.join(temp_dir, "extracted_data.csv")
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            for record in records:
                # Nested containers are stored as their string form so each
                # value fits in a single cell.
                writer.writerow({
                    k: (str(v) if isinstance(v, (list, dict)) else v)
                    for k, v in record.items()
                })
    except (OSError, ValueError, csv.Error):
        # Best effort: an unwritable temp location or an unencodable value
        # means "no file" for the UI rather than a crash.
        return None
    return csv_path
# -------------------------
# Build the Gradio UI
# -------------------------
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Styled header block. A Column (layout container) carries the hero CSS
    # class and wraps the Markdown.
    # NOTE(review): this used to be `with gr.HTML(...)`; gr.HTML is a content
    # component and cannot act as a `with` container on Gradio 4/5 — confirm
    # against the Gradio version pinned for this Space.
    with gr.Column(elem_classes="hero-container"):
        # NOTE(review): the "π" glyphs in this file's labels look like
        # mis-decoded emoji whose original code points cannot be recovered
        # from the text; they are kept as found. The dash in the tagline and
        # the export-button icon were decodable and have been restored.
        gr.Markdown(
            f"""
# π The Data Rescuer
Turn messy logs, disorganized lists, automated transcripts, and raw OCR scripts into highly structured business-ready assets — powered by `{MODEL_ID}`.
"""
        )

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=1):
            raw_input = gr.Textbox(
                label="1. Paste Unstructured Text",
                placeholder="Paste your messy meeting notes, emails, or raw text here...",
                lines=12,
            )
            schema_input = gr.Textbox(
                label="2. What fields do you want to extract?",
                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
                lines=3,
            )
            extract_btn = gr.Button("π Extract Structured Data", variant="primary", elem_classes="primary-btn")

        # Right column: multi-view output panels
        with gr.Column(scale=1):
            # KPI cards start out as the placeholder that generate_kpi_html
            # returns for empty input, so the markup lives in one place.
            kpi_output = gr.HTML(value=generate_kpi_html(None))
            with gr.Tabs():
                with gr.TabItem("π Structured Table"):
                    table_output = gr.Dataframe(
                        headers=["Field Name", "Extracted Value"],
                        datatype=["str", "str"],
                        interactive=False,
                        wrap=True,
                    )
                with gr.TabItem("π Raw JSON Tree"):
                    json_output = gr.JSON(label="JSON Object")
            # Action controls below outputs
            with gr.Row():
                export_btn = gr.Button("💾 Build Export File", variant="secondary", elem_classes="secondary-btn")
                csv_output = gr.File(label="Ready for Download", interactive=False)

    # -------------------------
    # Examples Panel
    # -------------------------
    gr.Markdown("### Try it out with these examples:")
    gr.Examples(
        examples=[
            [
                "Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
                "Task Owner, Task Description, Deadline, Client Name",
            ],
            [
                "Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
                "Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount",
            ],
        ],
        inputs=[raw_input, schema_input],
        label="Click an example to populate the inputs",
    )

    # -------------------------
    # Event Connections
    # -------------------------
    # 1. Extraction button fills the JSON tree, the table view and the KPI
    #    cards, in the order extract_data returns them.
    extract_btn.click(
        fn=extract_data,
        inputs=[raw_input, schema_input],
        outputs=[json_output, table_output, kpi_output],
    )
    # 2. Export button turns the current JSON value into a downloadable CSV.
    export_btn.click(
        fn=generate_csv,
        inputs=[json_output],
        outputs=[csv_output],
    )
# Launch the app
# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()