AbhijitClemson commited on
Commit
0b71838
·
verified ·
1 Parent(s): 02304f6

Update src/pages/categorized/page6.py

Browse files
Files changed (1) hide show
  1. src/pages/categorized/page6.py +670 -670
src/pages/categorized/page6.py CHANGED
@@ -1,671 +1,671 @@
1
- import os
2
- import re
3
- import json
4
- import tempfile
5
- import zipfile
6
- from io import BytesIO
7
- import fitz # PyMuPDF
8
- import cv2
9
- import numpy as np
10
-
11
- import streamlit as st
12
- import pandas as pd
13
- import requests
14
- import base64
15
- from typing import Dict, Any, Optional
16
- from collections import defaultdict
17
-
18
# SECURITY: the Gemini API key must come from the environment, never be
# committed to source control. A key was previously hard-coded here and is
# now exposed in version-control history — it must be revoked.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
20
-
21
# Gemini structured-output schema (the generationConfig.responseSchema
# format). Every extracted property, regardless of category, is returned in
# the single "mechanical_properties" array; each item is one property row.
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # "unit" and "test_condition" are intentionally optional.
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
-
45
def make_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from a material name.

    Takes the first letter of each word whose initial character is
    alphabetic; if no such word exists, falls back to the first six
    characters of the name, uppercased. An empty name yields "UNKNOWN".
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
-
53
# Rasterization resolution (dots per inch) used when rendering PDF pages to
# images; PDF point coordinates are scaled by DPI/72 to match the raster.
DPI = 300
# Matches figure-caption openings such as "Fig. 3" or "Figure 12".
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
-
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    Parameters:
        pdf_bytes: raw PDF file contents.
        filename: original file name; kept for interface compatibility but
            not sent to the API.

    Returns:
        The decoded JSON object on success, or None on any encoding,
        network, HTTP, or parsing failure (the error is surfaced to the
        user via st.error).
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:  # defensive: only fires if pdf_bytes is not bytes-like
        st.error(f"Error encoding PDF: {e}")
        return None

    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            "temperature": 0,  # deterministic extraction
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # The model may return several parts; use the first one whose text
        # looks like a JSON object.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = next(
            (p.get("text", "") for p in parts
             if p.get("text", "").strip().startswith("{")),
            None,
        )
        return json.loads(json_text) if json_text else None
    # Narrowed from a blanket `except Exception`: RequestException covers
    # network/HTTP errors; ValueError covers r.json() and json.loads failures.
    except (requests.RequestException, ValueError, KeyError) as e:
        st.error(f"Gemini API Error: {e}")
        return None
111
-
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extracted JSON payload into one DataFrame row per property.

    Guarantees a non-empty material abbreviation (derived from the material
    name when the model omitted it) and normalizes blank/missing fields to
    the same placeholder defaults as before.
    """
    name = data.get("material_name", "") or ""
    abbr = data.get("material_abbreviation", "") or ""
    if not abbr:
        abbr = make_abbreviation(name)

    records = [
        {
            "material_name": name,
            "material_abbreviation": abbr,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
134
-
135
# --- IMAGE EXTRACTION LOGIC ---
def get_page_image(page):
    """Render a PyMuPDF page to a BGR numpy image at DPI resolution.

    The page is rasterized with a DPI/72 scaling matrix, the raw sample
    buffer reshaped to (height, width, 3), and converted RGB -> BGR so
    downstream OpenCV calls see their native channel order.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
    # NOTE(review): assumes the pixmap has exactly 3 samples per pixel (no
    # alpha); holds for get_pixmap's default colorspace — confirm if changed.
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
140
-
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a plot.

    Rejects regions smaller than 100x100 px and regions whose ink density
    exceeds 35% (likely dense text or photos), then requires at least one
    long horizontal or vertical line — an axis — detected by erosion with
    a line-shaped structuring element a quarter of the crop's extent.
    """
    height, width = binary_crop.shape
    if height < 100 or width < 100:
        return False
    if cv2.countNonZero(binary_crop) / (width * height) > 0.35:
        return False
    horiz_line = cv2.getStructuringElement(cv2.MORPH_RECT, (width // 4, 1))
    vert_line = cv2.getStructuringElement(cv2.MORPH_RECT, (1, height // 4))
    found_h = cv2.countNonZero(cv2.erode(binary_crop, horiz_line, iterations=1)) > 0
    found_v = cv2.countNonZero(cv2.erode(binary_crop, vert_line, iterations=1)) > 0
    return found_h or found_v
153
-
154
def merge_boxes(rects):
    """Drop rectangles fully contained (within a 15 px margin) in a kept one.

    Rectangles are (x, y, w, h) tuples. Processing largest-area-first
    guarantees a container is kept before any box it contains is examined.
    """
    if not rects:
        return []
    ordered = sorted(rects, key=lambda box: box[2] * box[3], reverse=True)
    kept = []
    for box in ordered:
        bx, by, bw, bh = box
        contained = any(
            bx >= kx - 15 and by >= ky - 15
            and bx + bw <= kx + kw + 15 and by + bh <= ky + kh + 15
            for kx, ky, kw, kh in kept
        )
        if not contained:
            kept.append(box)
    return kept
164
-
165
def extract_images(pdf_doc):
    """Extract plot images from PDF using improved logic"""
    # Maps caption -> {"page": page number, "image_data": list of crops};
    # multiple crops sharing a caption are grouped together.
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # pixels of context kept around each detected plot

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Near-white threshold: anything darker than 225 counts as "ink".
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilation fuses nearby marks so a whole plot becomes one contour.
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions between 3% and 80% of the page area that also
            # pass the axis/ink-density plot heuristic.
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            # Choose the nearest "Fig./Figure N" text block BELOW the plot,
            # within 30% of the page height, as the plot's caption.
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    cap_y = b[1] * (DPI/72)  # PDF points -> raster pixels
                    dist = cap_y - (cy + ch)
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Pad the crop, clamped to the page bounds.
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,   # PNG-encoded bytes, for zip/download
                "array": crop         # BGR array, for st.image display
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
-
221
def create_zip(results, include_json=True):
    """Bundle extracted plot images (and optionally a metadata JSON) into a zip.

    Each result entry contributes its PNG files; when include_json is True a
    "plot_data.json" summary (caption, page, image count) is added. Returns
    the archive as raw bytes, ready for st.download_button.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            meta = [
                {"caption": entry["caption"], "page": entry["page"],
                 "image_count": len(entry["image_data"])}
                for entry in results
            ]
            zf.writestr("plot_data.json", json.dumps(meta, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
-
237
def input_form():
    """Render the manual data-entry form for a single material property.

    Drives three cascading selectboxes (material class -> property category
    -> property name), then a form for the property's values. On submit the
    row is appended to st.session_state["user_uploaded_data"]. Renders UI
    only; returns None.
    """
    # Property categories available per material class.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category) pair.
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Each downstream selectbox only appears once its parent has a value.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

            if submitted:
                if not (material_name and value):
                    st.error("Material name and value are required.")

                else:
                    # One-row DataFrame for the newly entered property.
                    Input_db = pd.DataFrame([{
                        "material_class": material_class,
                        "material_name": material_name,
                        "material_abbreviation": material_abbr,
                        "section": property_category,
                        "property_name": property_name,
                        "value": value,
                        "unit": unit,
                        "english_units": english,
                        "test_condition": test_condition,
                        "comments": comments
                    }])

                    st.success("Property added successfully")
                    st.dataframe(Input_db)

                    # First submission seeds the session-state table;
                    # later ones are appended.
                    if "user_uploaded_data" not in st.session_state:
                        st.session_state["user_uploaded_data"] = Input_db
                        return
                    else:
                        st.session_state["user_uploaded_data"] = pd.concat(
                            [st.session_state["user_uploaded_data"], Input_db],
                            ignore_index=True
                        )

    return
427
-
428
def main():
    """Streamlit entry point.

    Renders the manual input form, then — once a PDF is uploaded — extracts
    material properties via Gemini (tab 1) and plot images via OpenCV
    heuristics (tab 2), letting the user curate and download both.

    Fix: the image-display call previously referenced an undefined
    `img_width` name, raising NameError whenever extracted plots were
    rendered; the width is now a defined local constant.
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # --- per-session state defaults ---------------------------------------
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect a manual form submission by watching the row count of the
    # shared uploaded-data table across the input_form() call.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # No file selected: reset all per-PDF state and stop.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # Sanitized file stem used in all download file names.
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    if st.session_state.current_pdf_name != uploaded_file.name:
        # A different file was uploaded: invalidate cached extractions.
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False

    if st.session_state.form_submitted:
        # Skip re-extraction on the rerun triggered by a form submission.
        st.session_state.form_submitted = False
        st.info("A Form was submitted. But your previous extracted data has been added already. If you want to extract more data/plots" \
        "upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data  # optional: keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")
            # After extraction, or when rerunning, use stored data
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # Optional: add material_type for Page 1 filtering
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Display width for each extracted plot thumbnail.
                    # FIX: `img_width` was previously undefined (NameError).
                    img_width = 300

                    # Iterate over a snapshot so in-loop deletions are safe.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        st.image(img_data['array'], width=img_width, channels="BGR")
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            # Drop the whole group once its last image is removed.
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")
669
-
670
# Script entry point; importing this module does not launch the app.
if __name__ == "__main__":
    main()
 
1
+ import os
2
+ import re
3
+ import json
4
+ import tempfile
5
+ import zipfile
6
+ from io import BytesIO
7
+ import fitz # PyMuPDF
8
+ import cv2
9
+ import numpy as np
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import requests
14
+ import base64
15
+ from typing import Dict, Any, Optional
16
+ from collections import defaultdict
17
+
18
# SECURITY: the Gemini API key must come from the environment, never be
# committed to source control. A key was previously hard-coded here and is
# now exposed in version-control history — it must be revoked.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
20
+
21
# Gemini structured-output schema (the generationConfig.responseSchema
# format). Every extracted property, regardless of category, is returned in
# the single "mechanical_properties" array; each item is one property row.
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # "unit" and "test_condition" are intentionally optional.
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
+
45
def make_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from a material name.

    Takes the first letter of each word whose initial character is
    alphabetic; if no such word exists, falls back to the first six
    characters of the name, uppercased. An empty name yields "UNKNOWN".
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
+
53
# Rasterization resolution (dots per inch) used when rendering PDF pages to
# images; PDF point coordinates are scaled by DPI/72 to match the raster.
DPI = 300
# Matches figure-caption openings such as "Fig. 3" or "Figure 12".
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
+
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    Parameters:
        pdf_bytes: raw PDF file contents.
        filename: original file name; kept for interface compatibility but
            not sent to the API.

    Returns:
        The decoded JSON object on success, or None on any encoding,
        network, HTTP, or parsing failure (the error is surfaced to the
        user via st.error).
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:  # defensive: only fires if pdf_bytes is not bytes-like
        st.error(f"Error encoding PDF: {e}")
        return None

    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            "temperature": 0,  # deterministic extraction
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # The model may return several parts; use the first one whose text
        # looks like a JSON object.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = next(
            (p.get("text", "") for p in parts
             if p.get("text", "").strip().startswith("{")),
            None,
        )
        return json.loads(json_text) if json_text else None
    # Narrowed from a blanket `except Exception`: RequestException covers
    # network/HTTP errors; ValueError covers r.json() and json.loads failures.
    except (requests.RequestException, ValueError, KeyError) as e:
        st.error(f"Gemini API Error: {e}")
        return None
111
+
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extracted JSON payload into one DataFrame row per property.

    Guarantees a non-empty material abbreviation (derived from the material
    name when the model omitted it) and normalizes blank/missing fields to
    the same placeholder defaults as before.
    """
    name = data.get("material_name", "") or ""
    abbr = data.get("material_abbreviation", "") or ""
    if not abbr:
        abbr = make_abbreviation(name)

    records = [
        {
            "material_name": name,
            "material_abbreviation": abbr,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
134
+
135
# --- IMAGE EXTRACTION LOGIC ---
def get_page_image(page):
    """Render a PyMuPDF page to a BGR numpy image at DPI resolution.

    The page is rasterized with a DPI/72 scaling matrix, the raw sample
    buffer reshaped to (height, width, 3), and converted RGB -> BGR so
    downstream OpenCV calls see their native channel order.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
    # NOTE(review): assumes the pixmap has exactly 3 samples per pixel (no
    # alpha); holds for get_pixmap's default colorspace — confirm if changed.
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
140
+
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a plot.

    Rejects regions smaller than 100x100 px and regions whose ink density
    exceeds 35% (likely dense text or photos), then requires at least one
    long horizontal or vertical line — an axis — detected by erosion with
    a line-shaped structuring element a quarter of the crop's extent.
    """
    height, width = binary_crop.shape
    if height < 100 or width < 100:
        return False
    if cv2.countNonZero(binary_crop) / (width * height) > 0.35:
        return False
    horiz_line = cv2.getStructuringElement(cv2.MORPH_RECT, (width // 4, 1))
    vert_line = cv2.getStructuringElement(cv2.MORPH_RECT, (1, height // 4))
    found_h = cv2.countNonZero(cv2.erode(binary_crop, horiz_line, iterations=1)) > 0
    found_v = cv2.countNonZero(cv2.erode(binary_crop, vert_line, iterations=1)) > 0
    return found_h or found_v
153
+
154
def merge_boxes(rects):
    """Drop rectangles fully contained (within a 15 px margin) in a kept one.

    Rectangles are (x, y, w, h) tuples. Processing largest-area-first
    guarantees a container is kept before any box it contains is examined.
    """
    if not rects:
        return []
    ordered = sorted(rects, key=lambda box: box[2] * box[3], reverse=True)
    kept = []
    for box in ordered:
        bx, by, bw, bh = box
        contained = any(
            bx >= kx - 15 and by >= ky - 15
            and bx + bw <= kx + kw + 15 and by + bh <= ky + kh + 15
            for kx, ky, kw, kh in kept
        )
        if not contained:
            kept.append(box)
    return kept
164
+
165
def extract_images(pdf_doc):
    """Extract plot images from PDF using improved logic"""
    # Maps caption -> {"page": page number, "image_data": list of crops};
    # multiple crops sharing a caption are grouped together.
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # pixels of context kept around each detected plot

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Near-white threshold: anything darker than 225 counts as "ink".
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilation fuses nearby marks so a whole plot becomes one contour.
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions between 3% and 80% of the page area that also
            # pass the axis/ink-density plot heuristic.
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            # Choose the nearest "Fig./Figure N" text block BELOW the plot,
            # within 30% of the page height, as the plot's caption.
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    cap_y = b[1] * (DPI/72)  # PDF points -> raster pixels
                    dist = cap_y - (cy + ch)
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Pad the crop, clamped to the page bounds.
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,   # PNG-encoded bytes, for zip/download
                "array": crop         # BGR array, for st.image display
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
+
221
def create_zip(results, include_json=True):
    """Bundle extracted plot images (and optionally a metadata JSON) into a zip.

    Each result entry contributes its PNG files; when include_json is True a
    "plot_data.json" summary (caption, page, image count) is added. Returns
    the archive as raw bytes, ready for st.download_button.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            meta = [
                {"caption": entry["caption"], "page": entry["page"],
                 "image_count": len(entry["image_data"])}
                for entry in results
            ]
            zf.writestr("plot_data.json", json.dumps(meta, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
+
237
def input_form():
    """Render the manual property-entry form.

    Presents three cascading selectboxes (material class -> property
    category -> property name), then a form for the property's value and
    metadata. On a valid submission the row is appended to
    ``st.session_state["user_uploaded_data"]`` (created on first use).

    Returns:
        None. All output is written to the Streamlit page and to
        session state.
    """
    # Property categories available per material class. Keys must match the
    # selectbox options below and the keys of PROPERTY_NAMES.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category) pair.
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Each subsequent selectbox only appears once its parent has a value,
    # so the options can be looked up safely.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

        if submitted:
            if not (material_name and value):
                st.error("Material name and value are required.")
            else:
                # One-row frame matching the schema used elsewhere in the app.
                entry_df = pd.DataFrame([{
                    "material_class": material_class,
                    "material_name": material_name,
                    "material_abbreviation": material_abbr,
                    "section": property_category,
                    "property_name": property_name,
                    "value": value,
                    "unit": unit,
                    "english_units": english,
                    "test_condition": test_condition,
                    "comments": comments
                }])

                st.success("Property added successfully")
                st.dataframe(entry_df)

                # Create the accumulator on first submission; append afterwards.
                if "user_uploaded_data" not in st.session_state:
                    st.session_state["user_uploaded_data"] = entry_df
                else:
                    st.session_state["user_uploaded_data"] = pd.concat(
                        [st.session_state["user_uploaded_data"], entry_df],
                        ignore_index=True
                    )

    return
427
+
428
def main():
    """Streamlit entry point: manual form + PDF data/plot extraction UI.

    Session-state flags coordinate behavior across Streamlit reruns:
      - image_results:      list of extracted plot dicts for the current PDF
      - pdf_processed:      True once plots have been extracted for this PDF
      - current_pdf_name:   name of the PDF the cached state belongs to
      - form_submitted:     set when input_form() added a row this rerun
      - pdf_data_extracted: True once the Gemini call succeeded for this PDF
      - pdf_extracted_df:   cached DataFrame of extracted properties
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # Initialize every flag exactly once; Streamlit reruns this whole
    # function on every interaction, so guards are required.
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect whether input_form() appended a row during this rerun by
    # comparing the accumulator length before and after the call.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # No file: reset all PDF-related caches so a future upload starts clean.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # Sanitized identifier used in all download file names.
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    # A different PDF invalidates the cached extraction results.
    if st.session_state.current_pdf_name != uploaded_file.name:
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False

    if st.session_state.form_submitted:
        # A form submission triggered this rerun: show a notice instead of
        # re-rendering the extraction UI, and clear the flag for next time.
        # NOTE(review): the two concatenated literals yield
        # "...data/plotsupload again" — missing space between them.
        st.session_state.form_submitted = False
        st.info("A Form was submitted. But your previous extracted data has been added already. If you want to extract more data/plots" \
        "upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    # Write the upload to a temp file so fitz/Gemini helpers can read a path.
    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data # optional: keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")
            # After extraction, or when rerunning, use stored data
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                # Category is mandatory before the rows join the shared database.
                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # Optional: add material_type for Page 1 filtering
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            # Plot extraction is cached per PDF via pdf_processed.
            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        # Metadata only (captions/pages/counts), no image bytes.
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Iterate over a snapshot because Delete buttons mutate
                    # st.session_state.image_results mid-render; the bounds
                    # checks below guard against stale indices.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            # Delete removes the whole caption group (all images).
                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        # NOTE(review): img_width is not defined in this
                                        # function — presumably a module-level widget value
                                        # set earlier in the file; confirm it exists, else
                                        # this raises NameError at render time.
                                        st.image(img_data['array'], width=img_width, channels="BGR")
                                        # Remove deletes a single image; drop the group
                                        # entirely once it becomes empty.
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    # Richer metadata than the top button: includes filenames.
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")
670
# Run the Streamlit app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()