File size: 14,603 Bytes
4b61207
 
f79c1f2
 
 
4b61207
 
 
10f2c4b
4b61207
f79c1f2
 
 
 
 
4b61207
5fe2bf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789c331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b61207
f79c1f2
789c331
 
f79c1f2
4b61207
789c331
 
4b61207
080d73e
4b61207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
080d73e
4b61207
 
 
0db4a3c
4b61207
 
0db4a3c
4b61207
080d73e
 
 
 
 
 
 
 
 
 
203b146
080d73e
 
5fe2bf9
 
 
 
 
789c331
5fe2bf9
 
 
 
 
789c331
203b146
 
5fe2bf9
203b146
 
 
789c331
203b146
080d73e
 
5fe2bf9
080d73e
 
5fe2bf9
 
 
 
080d73e
789c331
 
 
203b146
f79c1f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
080d73e
 
 
f79c1f2
 
 
 
 
 
 
 
 
 
 
 
5fe2bf9
 
 
 
 
 
 
 
 
 
 
 
 
203b146
 
 
5fe2bf9
203b146
 
 
 
 
 
 
 
 
 
 
 
5fe2bf9
203b146
5fe2bf9
 
789c331
 
 
 
 
 
 
 
 
5fe2bf9
 
 
 
 
 
 
 
 
 
f79c1f2
5fe2bf9
 
 
 
f79c1f2
5fe2bf9
 
 
f79c1f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203b146
5fe2bf9
 
 
789c331
203b146
 
 
789c331
203b146
f79c1f2
5fe2bf9
f79c1f2
 
 
 
 
203b146
 
 
19f15ec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
import csv
import html
import json
import os
import tempfile

import gradio as gr
from huggingface_hub import InferenceClient

# Hugging Face repo ID of the chat model to query; edit this to target another model.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# The access token comes from the HF_TOKEN environment variable (a Space secret)
# instead of being hard-coded; it is None when the variable is unset.
hf_token = os.getenv("HF_TOKEN")

# Module-wide inference client bound to MODEL_ID and authenticated with hf_token.
client = InferenceClient(token=hf_token, model=MODEL_ID)

# -------------------------
# Custom CSS Styling
# -------------------------
# Stylesheet text handed to gr.Blocks(css=...) when the UI is built further down.
# The selectors are class names attached to components through elem_classes:
#   .hero-container  - gradient banner (plus its h1 / p) at the top of the page
#   .primary-btn     - gradient call-to-action button that lifts on hover
#   .secondary-btn   - rounded, semi-bold secondary button
#   .feedback-card   - tinted card with a left accent border
#                      NOTE(review): no component in the visible part of this
#                      file uses this class; confirm it is still needed.
# Many rules carry !important, presumably to win over the theme's defaults.
custom_css = """

.hero-container {

    background: linear-gradient(135deg, #6366f1 0%, #14b8a6 100%);

    padding: 2.5rem;

    border-radius: 20px;

    color: white;

    margin-bottom: 2rem;

    box-shadow: 0 10px 25px -5px rgba(99, 102, 241, 0.2);

}

.hero-container h1 {

    color: white !important;

    font-size: 2.5rem !important;

    font-weight: 800 !important;

    margin-bottom: 0.5rem;

    text-shadow: 0 2px 4px rgba(0,0,0,0.1);

}

.hero-container p {

    color: rgba(255, 255, 255, 0.9) !important;

    font-size: 1.1rem !important;

}

.primary-btn {

    background: linear-gradient(90deg, #6366f1 0%, #14b8a6 100%) !important;

    border: none !important;

    color: white !important;

    font-weight: 600 !important;

    border-radius: 10px !important;

    transition: all 0.3s ease !important;

    padding: 12px 24px !important;

}

.primary-btn:hover {

    transform: translateY(-2px);

    box-shadow: 0 8px 20px -5px rgba(99, 102, 241, 0.4);

}

.secondary-btn {

    border-radius: 10px !important;

    font-weight: 600 !important;

}

.feedback-card {

    border-left: 4px solid #6366f1;

    background-color: rgba(99, 102, 241, 0.05);

}

"""

# -------------------------
# Helper & Extraction Logic
# -------------------------
def _kpi_card(label, value, accent, value_style):
    """Return the HTML for one KPI card; *label* and *value* must already be HTML-safe."""
    return (
        "<div style='background: var(--body-background-fill, #ffffff); padding: 1rem; "
        "border-radius: 12px; box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.05); "
        "border: 1px solid var(--border-color-primary, #e5e7eb); "
        f"border-left: 5px solid {accent}; min-width: 140px; flex: 1;'>"
        "<div style='font-size: 0.7rem; color: var(--text-color-subdued, #6b7280); "
        "text-transform: uppercase; font-weight: 700; letter-spacing: 0.05em; "
        f"margin-bottom: 0.25rem;'>{label}</div>"
        f"<div style='{value_style}'>{value}</div>"
        "</div>"
    )


def generate_kpi_html(structured_data):
    """Render extracted data as a row of HTML KPI cards.

    Args:
        structured_data: Parsed extraction result. A dict produces one card
            for each of its first four items; a list produces a single
            record-count card.

    Returns:
        An HTML fragment (str). A dashed placeholder box is returned when
        there is nothing to show: falsy input, a dict carrying an ``"error"``
        key, or a value that is neither a dict nor a list.
    """
    placeholder = (
        "<div style='display: flex; justify-content: center; align-items: center; "
        "height: 100px; border: 2px dashed var(--border-color-primary, #e5e7eb); "
        "border-radius: 12px; color: var(--text-color-subdued, #9ca3af);'>"
        "Await extraction to generate KPI metrics..."
        "</div>"
    )

    # Only a dict can be an error payload. Testing `"error" in x` on arbitrary
    # values raises TypeError for numbers and gives false positives for lists
    # or strings that merely contain the word.
    is_error = isinstance(structured_data, dict) and "error" in structured_data
    if not structured_data or is_error or not isinstance(structured_data, (dict, list)):
        return placeholder

    # (keywords, colour) pairs checked in order; the first match wins.
    accent_rules = (
        (("price", "total", "amount", "cost", "revenue", "budget"), "#10b981"),  # emerald: money
        (("date", "deadline", "due", "time"), "#f59e0b"),  # amber: dates / reminders
        (("status", "priority", "importance"), "#ef4444"),  # crimson: status / alerts
    )
    default_accent = "#6366f1"  # indigo

    cards = []
    if isinstance(structured_data, dict):
        value_style = (
            "font-size: 1.05rem; color: var(--body-text-color, #111827); "
            "font-weight: 800; word-break: break-word;"
        )
        # Only the first four fields fit the card row.
        for key, val in list(structured_data.items())[:4]:
            label = str(key).replace("_", " ").replace("-", " ").title()
            text = ", ".join(map(str, val)) if isinstance(val, list) else str(val)
            if len(text) > 40:
                text = text[:37] + "..."

            lowered = label.lower()
            accent = next(
                (color for words, color in accent_rules if any(w in lowered for w in words)),
                default_accent,
            )
            # Keys and values originate from model output over user-supplied
            # text, so escape them before embedding in markup. Escaping happens
            # after truncation so an entity is never cut in half.
            cards.append(_kpi_card(html.escape(label), html.escape(text), accent, value_style))
    else:
        # Lists get a single summary card with the number of records.
        value_style = (
            "font-size: 1.5rem; color: var(--body-text-color, #111827); font-weight: 800;"
        )
        cards.append(
            _kpi_card("Total Records Found", len(structured_data), default_accent, value_style)
        )

    return (
        "<div style='display: flex; flex-wrap: wrap; gap: 0.75rem; "
        "margin-bottom: 1rem; width: 100%;'>"
        + "".join(cards)
        + "</div>"
    )

def _strip_code_fence(text):
    """Return *text* with a surrounding Markdown code fence removed, if it has one."""
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) < 2:
        # A lone fence line has no body to unwrap; let the JSON parser reject it.
        return text
    body = lines[1:]  # drop the opening fence together with any language tag
    if body[-1].strip() == "```":
        body = body[:-1]
    return "\n".join(body).strip()


def _to_table_rows(data):
    """Flatten parsed JSON into ``[name, value]`` string rows for the table view."""
    if isinstance(data, dict):
        return [
            [key, ", ".join(map(str, value)) if isinstance(value, list) else str(value)]
            for key, value in data.items()
        ]
    if isinstance(data, list):
        return [[f"Item {pos}", str(item)] for pos, item in enumerate(data, start=1)]
    return []


def _error_result(payload, row_label, row_message):
    """Build the (json, table rows, KPI html) triple reported for a failure."""
    return payload, [[row_label, row_message]], generate_kpi_html(payload)


def extract_data(raw_text, fields_to_extract):
    """Ask the chat model to extract the requested fields from unstructured text.

    Args:
        raw_text: The unstructured text to mine.
        fields_to_extract: Free-form description of the fields wanted.

    Returns:
        A 3-tuple ``(json_payload, table_rows, kpi_html)``. On success
        ``json_payload`` is the parsed dict or list. On any failure (missing
        token, empty input, API error, unparsable or non-container output) it
        is a dict with an ``"error"`` key and the other two elements describe
        the same failure. The function reports errors through its return value
        and does not raise for the failures handled here.

    NOTE(review): a successful extraction whose fields include one literally
    named "error" is indistinguishable from a failure payload downstream.
    """
    if not hf_token:
        return _error_result(
            {"error": "HF_TOKEN secret is missing. Please add your Hugging Face Access Token to the Space Secrets."},
            "Error",
            "HF_TOKEN missing",
        )

    # Treat None like an empty string so a missing input is reported to the
    # user instead of raising AttributeError on .strip().
    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
        return _error_result(
            {"error": "Please provide both raw text and fields to extract."},
            "Error",
            "Incomplete inputs",
        )

    system_prompt = (
        "You are an expert data extraction assistant. Your job is to extract specific "
        "information from messy, unstructured text and output it as clean, valid JSON.\n"
        "Rules:\n"
        "1. Only extract the fields requested.\n"
        "2. If a field is not found in the text, return 'null' for that field.\n"
        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
    )
    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        response = client.chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.1,  # near-deterministic output suits extraction
        )
        # An absent message body is handled as empty text, which then fails
        # JSON parsing below and is reported as invalid model output.
        output_text = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # UI boundary: whatever the client raises is shown to the user rather
        # than crashing the event handler.
        error_msg = str(e)
        if "model_not_found" in error_msg or "does not exist" in error_msg:
            return _error_result(
                {
                    "error": f"The model '{MODEL_ID}' was not found on Hugging Face.",
                    "troubleshooting": [
                        "1. Check your Hugging Face repo for typos (case-sensitive).",
                        "2. Verify HF_TOKEN secret read permissions.",
                        "3. GGUF or LoRA adapter models are not directly supported by the Serverless API.",
                    ],
                },
                "Connection Error",
                "Model Not Found",
            )
        return _error_result({"error": error_msg}, "Error", error_msg)

    try:
        # The model sometimes wraps its answer in a ``` fence despite the prompt.
        structured_data = json.loads(_strip_code_fence(output_text))
    except json.JSONDecodeError:
        return _error_result(
            {
                "error": "The model failed to return valid JSON. It returned this instead:",
                "raw_output": output_text,
            },
            "Error",
            "Invalid JSON parsed",
        )

    # Bare scalars ("text", 5, null) are valid JSON but not an extraction result.
    if not isinstance(structured_data, (dict, list)):
        return _error_result(
            {
                "error": "The model returned JSON that is not an object or array.",
                "raw_output": output_text,
            },
            "Error",
            "Invalid JSON parsed",
        )

    return structured_data, _to_table_rows(structured_data), generate_kpi_html(structured_data)

def generate_csv(json_data):
    """Write the extracted JSON to a temporary CSV file and return its path.

    Args:
        json_data: A dict (written as a single row) or a list whose dict items
            each become one row. Non-dict list items are skipped.

    Returns:
        Path (str) to ``extracted_data.csv`` inside a fresh temporary
        directory, or ``None`` when there is nothing to export (empty input,
        an error payload, no dict rows, unsupported type) or the file cannot
        be written.
    """
    if not json_data:
        return None

    if isinstance(json_data, dict):
        # A dict carrying an "error" key is a failure payload, not data.
        if "error" in json_data:
            return None
        records = [json_data]
    elif isinstance(json_data, list):
        records = [item for item in json_data if isinstance(item, dict)]
    else:
        return None

    # dict.fromkeys de-duplicates while keeping first-seen order, so the columns
    # come out in a stable, input-driven order (a set would not guarantee one).
    headers = list(dict.fromkeys(key for record in records for key in record))
    if not headers:
        return None

    # Nested containers are serialised as JSON so the cell stays machine-readable
    # (str() would emit Python repr: single quotes, None, True).
    rows = [
        {
            key: json.dumps(value, ensure_ascii=False) if isinstance(value, (list, dict)) else value
            for key, value in record.items()
        }
        for record in records
    ]

    # The temp directory is created only after validation, so rejected input
    # leaves nothing behind on disk.
    try:
        csv_path = os.path.join(tempfile.mkdtemp(), "extracted_data.csv")
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            writer.writeheader()
            writer.writerows(rows)
    except (OSError, csv.Error, ValueError):
        # Best-effort export: a failed write yields "no file" instead of an error.
        return None
    return csv_path

# -------------------------
# Build the Gradio UI
# -------------------------
# Layout: hero banner, then a two-column row (inputs on the left, result views
# on the right), then clickable examples. Event wiring sits at the bottom.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

    # Hero banner. A layout container (rather than gr.HTML, which is a content
    # component) wraps the heading so the .hero-container rules apply to it.
    with gr.Column(elem_classes="hero-container"):
        gr.Markdown(
            "# 🛟 The Data Rescuer\n\n"
            "Turn messy logs, disorganized lists, automated transcripts, and raw OCR "
            "scripts into highly structured business-ready assets — powered by "
            f"`{MODEL_ID}`."
        )

    with gr.Row():
        # Left Column: Inputs
        with gr.Column(scale=1):
            raw_input = gr.Textbox(
                label="1. Paste Unstructured Text",
                placeholder="Paste your messy meeting notes, emails, or raw text here...",
                lines=12,
            )

            schema_input = gr.Textbox(
                label="2. What fields do you want to extract?",
                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
                lines=3,
            )

            extract_btn = gr.Button(
                "🚀 Extract Structured Data",
                variant="primary",
                elem_classes="primary-btn",
            )

        # Right Column: Multi-view Output Panels
        with gr.Column(scale=1):
            # KPI cards; starts with the same placeholder generate_kpi_html
            # returns for empty data, so the markup lives in one place.
            kpi_output = gr.HTML(value=generate_kpi_html(None))

            with gr.Tabs():
                with gr.TabItem("📊 Structured Table"):
                    table_output = gr.Dataframe(
                        headers=["Field Name", "Extracted Value"],
                        datatype=["str", "str"],
                        interactive=False,
                        wrap=True,
                    )
                with gr.TabItem("🔍 Raw JSON Tree"):
                    json_output = gr.JSON(label="JSON Object")

            # Action controls below outputs
            with gr.Row():
                export_btn = gr.Button(
                    "💾 Build Export File",
                    variant="secondary",
                    elem_classes="secondary-btn",
                )
                csv_output = gr.File(label="Ready for Download", interactive=False)

    # -------------------------
    # Examples Panel
    # -------------------------
    gr.Markdown("### Try it out with these examples:")
    gr.Examples(
        examples=[
            [
                "Hey guys, quick recap of today's sync. Sarah is going to handle the frontend React components by next Tuesday. John, you need to fix the database migration issue before Friday. Also, our client 'Acme Corp' wants the final delivery by October 15th.",
                "Task Owner, Task Description, Deadline, Client Name",
            ],
            [
                "Invoice #99214. From: BlueTech Software. To: Jane Doe. Items: 1x Server Maintenance ($500), 2x Cloud Storage ($100 each). Total due: $700. Please pay by end of month.",
                "Invoice Number, Sender, Recipient, Items (list of names and prices), Total Amount",
            ],
        ],
        inputs=[raw_input, schema_input],
        label="Click an example to populate the inputs",
    )

    # -------------------------
    # Event Connections
    # -------------------------
    # 1. Extraction fills the JSON tree, the table view, and the KPI cards.
    extract_btn.click(
        fn=extract_data,
        inputs=[raw_input, schema_input],
        outputs=[json_output, table_output, kpi_output],
    )

    # 2. CSV export reads back whatever the JSON view currently holds.
    export_btn.click(
        fn=generate_csv,
        inputs=[json_output],
        outputs=[csv_output],
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()