Seth0330 committed on
Commit
919773e
·
verified ·
1 Parent(s): 38f8171

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +398 -0
app.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import csv
import io
import json
import os
from datetime import datetime

import pandas as pd
import requests
import streamlit as st
from PIL import Image
11
+
12
# Optional PDF support via PyMuPDF.
# The import is attempted once at start-up; PDF_SUPPORT records whether it
# succeeded so the rest of the app can report a clear message instead of
# failing with a NameError on ``fitz``.
try:
    import fitz  # PyMuPDF
except ImportError:
    PDF_SUPPORT = False
else:
    PDF_SUPPORT = True
18
+
19
+ # ---------------------------
20
+ # Page config
21
+ # ---------------------------
22
# Browser-tab title/icon, wide layout, and an initially open sidebar.
st.set_page_config(
    page_title="Curiosity AI Scans",
    page_icon="🔍",
    layout="wide",
    initial_sidebar_state="expanded",
)
28
+
29
+ # ---------------------------
30
+ # Helpers
31
+ # ---------------------------
32
def resize_image(image, max_size=1920):
    """Return ``image`` scaled down so neither side exceeds ``max_size``.

    The aspect ratio is preserved: the longer side becomes exactly
    ``max_size`` and the shorter side is scaled by the same factor.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize(size, resample)`` method (a PIL image in this app).
        max_size: Largest allowed width or height, in pixels.

    Returns:
        The same ``image`` object when it already fits, otherwise the
        result of ``image.resize`` with Lanczos resampling.
    """
    width, height = image.size
    if width <= max_size and height <= max_size:
        return image

    # Clamp the short side to 1 px: with an extreme aspect ratio the integer
    # truncation would otherwise produce 0, which resize() rejects.
    if width > height:
        new_width = max_size
        new_height = max(1, int(height * (max_size / width)))
    else:
        new_height = max_size
        new_width = max(1, int(width * (max_size / height)))
    return image.resize((new_width, new_height), Image.LANCZOS)
43
+
44
def image_to_base64(image):
    """Encode ``image`` as JPEG and return the bytes as base64 text.

    Args:
        image: Object exposing ``mode``, ``convert(mode)`` and
            ``save(fp, format=...)`` (a PIL image in this app).

    Returns:
        The base64 encoding of the JPEG data as a ``str``.
    """
    # JPEG has no alpha or palette support, so saving e.g. an RGBA or P image
    # would raise; normalise to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    buf = io.BytesIO()
    image.save(buf, format='JPEG')
    return base64.b64encode(buf.getvalue()).decode('utf-8')
48
+
49
def extract_structured_data(content, fields):
    """Try to pull a JSON object out of model text.

    Candidates are tried in order and the first one that parses to a JSON
    object wins:

    1. the body of each triple-backtick fence (with or without a ``json`` tag),
    2. the whole text,
    3. the span from the first ``{`` to the last ``}``.

    Args:
        content: Raw text returned by the model. Anything that is not a
            string yields an empty result.
        fields: Requested field names. Currently unused; every key present
            in the parsed object is returned.

    Returns:
        A new ``dict`` with the parsed object, or ``{}`` when nothing
        parseable was found.
    """
    if not isinstance(content, str):
        return {}

    candidates = []

    # Splitting on the fence marker leaves fenced bodies at odd indexes.
    for fenced in content.split("```")[1::2]:
        body = fenced.strip()
        if body[:4].lower() == "json":
            body = body[4:]
        candidates.append(body.strip())

    candidates.append(content.strip())

    first_brace = content.find("{")
    last_brace = content.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(content[first_brace:last_brace + 1])

    for candidate in candidates:
        if not candidate:
            continue
        try:
            parsed = json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a ValueError; try the next candidate.
            continue
        if isinstance(parsed, dict):
            return dict(parsed)
    return {}
68
+
69
+ # ---------------------------
70
+ # OpenRouter client
71
+ # ---------------------------
72
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set this in Space Secrets


def _attribution_referer(default="https://hf.space"):
    """Return the URL sent as the ``HTTP-Referer`` attribution header.

    ``SPACE_URL`` is looked up in Streamlit secrets first, then in the
    process environment; ``default`` is used when neither provides a value.
    """
    try:
        configured = st.secrets.get("SPACE_URL")
    except Exception:
        # Reading st.secrets fails when no secrets file is configured. The
        # header is optional, so that must not abort the request.
        configured = None
    return configured or os.getenv("SPACE_URL") or default


def query_openrouter(prompt: str, image_base64: str, model_id: str) -> str:
    """Send one text + image chat request to OpenRouter and return the reply.

    Args:
        prompt: Instruction text for the model.
        image_base64: Base64-encoded JPEG data (no ``data:`` prefix).
        model_id: OpenRouter model id, e.g. ``"google/gemma-3-4b-it"``.

    Returns:
        The text of the first choice's message; ``""`` when the message
        content is empty or null.

    Raises:
        RuntimeError: If the API key is missing, the response body carries
            an ``error`` entry, or the body has no choice/message.
        requests.HTTPError: If the HTTP status is an error
            (via ``raise_for_status``).
    """
    if not OPENROUTER_API_KEY:
        raise RuntimeError("Missing OPENROUTER_API_KEY. Add it in your Space → Settings → Variables & secrets.")

    data_url = f"data:image/jpeg;base64,{image_base64}"

    payload = {
        "model": model_id,  # e.g., "google/gemma-3-4b-it"
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        ],
        "max_tokens": 800,
    }

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional but recommended for attribution
        "HTTP-Referer": _attribution_referer(),
        "X-Title": "Curiosity AI Scans",
    }

    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )
    response.raise_for_status()
    data = response.json()

    # Defensive: surface an error payload explicitly instead of letting it
    # show up as a KeyError on "choices" below.
    if isinstance(data, dict) and data.get("error"):
        error = data["error"]
        detail = error.get("message") if isinstance(error, dict) else error
        raise RuntimeError(f"OpenRouter returned an error: {detail}")

    try:
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError("OpenRouter response did not contain a message.") from exc
    return content or ""
111
+
112
+ # ---------------------------
113
+ # Core processing
114
+ # ---------------------------
115
def process_image(image, filename, fields=None, model=None):
    """Run one image through the vision model.

    The image is resized and base64-encoded, then sent with either a
    description prompt or a field-extraction prompt.

    Args:
        image: Image to analyse (passed to ``resize_image``).
        filename: Label stored with the result.
        fields: Field names to extract. ``None`` or an empty list selects
            the general-description prompt.
        model: Model id forwarded to ``query_openrouter``.

    Returns:
        A 3-tuple ``(result, content, structured_data)``:
        ``result`` is ``{'filename', 'description'}`` or
        ``{'filename', 'extraction'}``; ``content`` is the raw model text;
        ``structured_data`` is ``None`` in description mode, otherwise a
        dict that always holds ``'filename'`` plus any parsed keys.
    """
    img_base64 = image_to_base64(resize_image(image))

    # An empty field list would yield a prompt asking for no keys at all, so
    # it is treated the same as "no fields requested".
    if not fields:
        prompt = "Describe this image in detail."
        content = query_openrouter(prompt, img_base64, model)
        return {'filename': filename, 'description': content}, content, None

    fields_str = ", ".join(fields)
    prompt = (
        "Extract the following fields from this image and return JSON only "
        f"with these exact keys: {fields_str}. If a field is missing, use an empty string."
    )
    content = query_openrouter(prompt, img_base64, model)

    structured_data = {'filename': filename}
    parsed = extract_structured_data(content, fields)
    if parsed:
        structured_data.update(parsed)
    return {'filename': filename, 'extraction': content}, content, structured_data
134
+
135
def _render_pdf_page(page, zoom=1.5):
    """Rasterize one PyMuPDF page to an RGB PIL image at the given zoom."""
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)


def process_pdf(file_bytes, filename, fields=None, process_pages_separately=True, model=None):
    """Rasterize PDF pages and run them through the same image path.

    Args:
        file_bytes: Raw PDF bytes.
        filename: Name used to label results.
        fields: Field names forwarded to ``process_image``.
        process_pages_separately: When true, every page is processed and
            labelled ``"<filename> (Page N)"``; otherwise only the first
            page is processed under ``filename``.
        model: Model id forwarded to ``process_image``.

    Yields:
        6-tuples ``(page_num, page_count, image, page_filename, content,
        structured_data)``. On failure a single tuple is yielded with
        ``page_num``, ``page_count`` and ``image`` set to ``None`` and the
        error message in the ``content`` slot.
    """
    if not PDF_SUPPORT:
        yield None, None, None, filename, "PDF support requires PyMuPDF. Install pymupdf.", None
        return

    pdf_document = None
    try:
        pdf_document = fitz.open(stream=file_bytes, filetype="pdf")
        page_count = len(pdf_document)
        if page_count == 0:
            # Report an empty document explicitly rather than yielding
            # nothing (page mode) or failing on pdf_document[0].
            raise ValueError("PDF has no pages")

        if process_pages_separately:
            for page_num in range(page_count):
                img = _render_pdf_page(pdf_document[page_num])
                page_filename = f"{filename} (Page {page_num+1})"
                _, content, structured_data = process_image(img, page_filename, fields, model)
                yield page_num, page_count, img, page_filename, content, structured_data
        else:
            img = _render_pdf_page(pdf_document[0])
            _, content, structured_data = process_image(img, filename, fields, model)
            yield 0, page_count, img, filename, content, structured_data

    except Exception as e:
        yield None, None, None, filename, f"Error processing PDF: {str(e)}", None
    finally:
        # Runs when the generator finishes, fails, or is closed early, so the
        # document handle is always released.
        if pdf_document is not None:
            pdf_document.close()
162
+
163
def _csv_cell(value):
    """Return ``value`` in a form suitable for a single CSV cell.

    Lists and dicts are serialised as JSON so nested model output stays
    machine-readable; everything else is passed through unchanged.
    """
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value


def create_download_buttons(results, structured_results, extraction_mode):
    """Render the "Download Results" section with CSV download buttons.

    Args:
        results: Dicts with ``'filename'`` and either ``'description'`` or
            ``'extraction'``; written to the two-column results CSV.
        structured_results: Dicts of extracted fields (each including
            ``'filename'``); written to the structured CSV.
        extraction_mode: Selected mode label. The structured CSV is only
            offered for ``"Custom field extraction"`` with non-empty
            ``structured_results``.
    """
    st.header("Download Results")

    # Simple CSV of descriptions or raw extraction
    base_csv = io.StringIO()
    base_writer = csv.writer(base_csv)
    base_writer.writerow(['Filename', 'Description/Extraction'])
    base_writer.writerows(
        [r['filename'], r.get('description', r.get('extraction', ''))]
        for r in results
    )

    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = f"image_analysis_{ts}.csv"

    st.success("All files processed.")
    st.download_button(
        label="Download Results (CSV)",
        data=base_csv.getvalue(),
        file_name=base_name,
        mime="text/csv",
        use_container_width=True,
    )

    # Structured CSV if available
    if extraction_mode == "Custom field extraction" and structured_results:
        # Keep 'filename' as the first column; the remaining keys are the
        # sorted union of everything the model returned across rows.
        other_fields = sorted(
            {key for row in structured_results for key in row} - {'filename'}
        )
        headers = ['filename'] + other_fields

        buff = io.StringIO()
        w = csv.writer(buff)
        w.writerow(headers)
        for row in structured_results:
            w.writerow([_csv_cell(row.get(h, '')) for h in headers])

        st.download_button(
            label="Download Structured Data (CSV)",
            data=buff.getvalue(),
            file_name=f"structured_data_{ts}.csv",
            mime="text/csv",
            use_container_width=True,
        )
203
+
204
+ # ---------------------------
205
+ # UI
206
+ # ---------------------------
207
st.title("Curiosity AI Scans")

# Session state: result lists are created once and then persist across
# Streamlit reruns of this script.
for _state_key in ('results', 'structured_results'):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
214
+
215
# Sidebar: file upload, model choice, and extraction options.
# Defines uploaded_files, selected_model, extraction_mode, pdf_process_mode,
# fields and process_button for the processing section below.
with st.sidebar:
    st.header("Upload Files")
    uploaded_files = st.file_uploader(
        "Choose images or PDFs",
        accept_multiple_files=True,
        type=['png', 'jpg', 'jpeg', 'pdf'],
    )

    st.header("Model Settings")
    # OpenRouter model id for Gemma 3 4B Instruct (vision)
    selected_model = st.selectbox(
        "Choose vision model:",
        ["google/gemma-3-4b-it"],
        help="OpenRouter model id",
    )

    # Defaults used until files are uploaded and the option widgets appear.
    extraction_mode = "General description"
    pdf_process_mode = "Process each page separately"
    fields = None

    if uploaded_files:
        st.write(f"Uploaded {len(uploaded_files)} file(s)")

        st.header("Data Extraction Options")
        extraction_mode = st.radio(
            "Choose extraction mode:",
            ["General description", "Custom field extraction"],
        )

        if extraction_mode == "Custom field extraction":
            custom_fields = st.text_area(
                "Enter fields to extract (comma separated):",
                value="Invoice number, Date, Company name, Total amount",
            )
            # Split on commas and drop blank entries.
            fields = [f.strip() for f in custom_fields.split(",") if f.strip()]

        # The PDF mode selector is only shown when at least one PDF is present.
        if any(file.name.lower().endswith('.pdf') for file in uploaded_files):
            pdf_process_mode = st.radio(
                "How to process PDF files:",
                ["Process each page separately", "Process entire PDF as one document"],
            )

        process_button = st.button("Process Files", use_container_width=True)
    else:
        process_button = False
        st.info("Upload images or PDFs to begin.")
262
+
263
# Main processing
def _is_pdf(uploaded):
    """Return True when the upload's name ends in ``.pdf`` (case-insensitive)."""
    return uploaded.name.lower().endswith('.pdf')


def _count_pdf_pages(file_bytes):
    """Return the PDF's page count, or 1 when the document cannot be opened."""
    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            return len(doc)
        finally:
            doc.close()
    except Exception:
        return 1


def _show_result(title, image, content, structured_data, note=None):
    """Render one processed item: thumbnail on the left, model output on the right."""
    st.subheader(title)
    c1, c2 = st.columns([1, 2])
    with c1:
        st.image(image, width=250)
        if note:
            st.info(note)
    with c2:
        st.write(content)
        if structured_data and len(structured_data) > 1:
            st.success("Extracted structured data")
            st.json(structured_data)
    st.divider()


if uploaded_files and process_button:
    if not OPENROUTER_API_KEY:
        st.error("OPENROUTER_API_KEY is not set. Add it in your Space → Settings → Variables & secrets.")
    else:
        st.header("Processing Results")
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Each run starts from a clean slate.
        st.session_state.results = []
        st.session_state.structured_results = []

        process_separately = pdf_process_mode == "Process each page separately"

        # Count items to process: one per PDF page in page mode, otherwise
        # one per file. getvalue() returns the bytes without moving the
        # upload's read position.
        total_items = 0
        for f in uploaded_files:
            if _is_pdf(f) and PDF_SUPPORT and process_separately:
                total_items += _count_pdf_pages(f.getvalue())
            else:
                total_items += 1
        total_items = max(total_items, 1)

        processed_count = 0

        # Process files
        for f in uploaded_files:
            if _is_pdf(f):
                if not PDF_SUPPORT:
                    st.error("PDF support requires PyMuPDF. Add 'pymupdf' to requirements.txt.")
                    processed_count += 1
                    progress_bar.progress(min(processed_count / total_items, 1.0))
                    continue

                try:
                    pages = process_pdf(f.getvalue(), f.name, fields, process_separately, selected_model)
                    for page_num, page_count, image, page_filename, content, structured_data in pages:
                        if page_num is None:
                            # Error tuple from process_pdf: message is in `content`.
                            st.error(content)
                            processed_count += 1
                            progress_bar.progress(min(processed_count / total_items, 1.0))
                            continue

                        status_text.text(f"Processing {page_filename} ({page_num+1}/{page_count})")
                        # Same result key the image path uses for each mode.
                        result_key = 'extraction' if fields else 'description'
                        st.session_state.results.append({'filename': page_filename, result_key: content})
                        if structured_data and len(structured_data) > 1:
                            st.session_state.structured_results.append(structured_data)

                        note = None
                        if page_count > 1 and not process_separately:
                            note = f"PDF has {page_count} pages. Showing first page only."
                        _show_result(page_filename, image, content, structured_data, note)

                        processed_count += 1
                        progress_bar.progress(min(processed_count / total_items, 1.0))

                except Exception as e:
                    st.error(f"Error processing PDF {f.name}: {e}")
                    processed_count += 1
                    progress_bar.progress(min(processed_count / total_items, 1.0))

            else:
                try:
                    status_text.text(f"Processing image {f.name}")
                    f.seek(0)
                    image = Image.open(f).convert("RGB")
                    result, content, structured_data = process_image(image, f.name, fields, selected_model)
                    st.session_state.results.append(result)
                    if structured_data and len(structured_data) > 1:
                        st.session_state.structured_results.append(structured_data)

                    _show_result(f"Image: {f.name}", image, content, structured_data)

                except Exception as e:
                    st.error(f"Error processing image {f.name}: {e}")

                processed_count += 1
                progress_bar.progress(min(processed_count / total_items, 1.0))

        # A PDF that failed part-way leaves uncounted pages; finish the bar.
        progress_bar.progress(1.0)
        status_text.text("Processing complete.")

# Rendered from session state rather than inside the processing branch, so the
# buttons are still present on reruns where process_button is False.
if st.session_state.results:
    create_download_buttons(
        st.session_state.results,
        st.session_state.structured_results,
        extraction_mode,
    )
376
+
377
# Empty state: short usage guide shown until at least one file is uploaded.
if not uploaded_files:
    st.info("Upload files using the sidebar to get started.")
    st.write("""
How to use:
1) Upload one or more images or PDFs
2) Choose the OpenRouter vision model (Gemma 3 4B IT)
3) Pick description or custom field extraction
4) For PDFs, choose page-by-page or first page
5) Click Process Files
6) Review outputs and download CSVs
""")
389
+
390
# Footer: horizontal rule followed by a centred attribution line.
# unsafe_allow_html is needed because the attribution is raw HTML.
st.markdown("---")
st.markdown(
    """
<div style="text-align: center; margin-top: 12px; opacity: 0.7;">
Built for Hugging Face Spaces + OpenRouter
</div>
""",
    unsafe_allow_html=True,
)