gvlktejaswi commited on
Commit
b443b15
·
verified ·
1 Parent(s): 778dfb3

Delete src/pages

Browse files
src/pages/3_Categorized_Search.py DELETED
@@ -1,34 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image # Used to open and handle image files
3
-
4
-
5
-
6
- def load_page1():
7
- from pages.categorized.page1 import main
8
- main()
9
-
10
- # def load_page2():
11
- # from pages.categorized.page2 import main
12
- # main()
13
-
14
-
15
-
16
- load_page1()
17
-
18
-
19
- #st.sidebar.button('Material Type', on_click=load_page1)
20
- #st.sidebar.button('Trade Name', on_click=load_page2)
21
- #st.sidebar.button('Manufacturer Name', on_click=load_page3)
22
-
23
- #image = Image.open('logo.png')
24
- #st.image(image, caption='a', use_container_width=True)
25
- st.sidebar.write("")
26
- st.sidebar.write("")
27
- st.sidebar.write("")
28
- st.sidebar.write("")
29
- st.sidebar.write("")
30
- st.sidebar.write("")
31
- st.sidebar.write("")
32
- st.sidebar.write("")
33
- st.sidebar.image("logo.png", caption=" ", width=150)
34
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/5_Upload_Data.py DELETED
@@ -1,18 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image
3
-
4
-
5
- # def load_page1():
6
- # from pages.categorized.page1 import main
7
- # main()
8
-
9
- def load_page6():
10
- from pages.categorized.page6 import main
11
- main()
12
-
13
- def load_page3():
14
- from pages.categorized.page3 import main
15
- main()
16
-
17
- load_page6()
18
- #load_page3()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/5_Upload_Data.py DELETED
@@ -1,18 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image
3
-
4
-
5
- # def load_page1():
6
- # from pages.categorized.page1 import main
7
- # main()
8
-
9
- def load_page6():
10
- from pages.pages.categorized.page6 import main
11
- main()
12
-
13
- def load_page3():
14
- from pages.pages.categorized.page3 import main
15
- main()
16
-
17
- load_page6()
18
- #load_page3()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/Backend/Pdf_DataExtraction.py DELETED
@@ -1,120 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from PIL import Image
4
- import requests
5
- import base64
6
- import json
7
- import os
8
- from typing import Dict, Any, Optional
9
-
10
-
11
-
12
-
13
- # Backend PDF extraction Logic
14
- API_KEY = "[REDACTED-GOOGLE-API-KEY]"  # SECURITY(review): a live Google API key was committed here — revoke/rotate it and load it from an environment variable instead
15
- API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
16
-
17
- SCHEMA = {
18
- "type": "OBJECT",
19
- "properties": {
20
- "material_name": {"type": "STRING"},
21
- "material_abbreviation": {"type": "STRING"},
22
- "mechanical_properties": {
23
- "type": "ARRAY",
24
- "items": {
25
- "type": "OBJECT",
26
- "properties": {
27
- "section": {"type": "STRING"},
28
- "property_name": {"type": "STRING"},
29
- "value": {"type": "STRING"},
30
- "unit": {"type": "STRING"},
31
- "english": {"type": "STRING"},
32
- "test_condition": {"type": "STRING"},
33
- "comments": {"type": "STRING"}
34
- },
35
- "required": ["section", "property_name", "value", "english", "comments"]
36
- }
37
- }
38
- }
39
- }
40
-
41
- # === GEMINI CALL FUNCTION ===
42
- def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
43
- """Calls Gemini API with PDF bytes"""
44
- try:
45
- encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
46
- mime_type = "application/pdf"
47
- except Exception as e:
48
- st.error(f"Error encoding PDF: {e}")
49
- return None
50
-
51
- prompt = (
52
- "Extract all experimental data from this research paper. "
53
- "For each measurement, extract: "
54
- "- experiment_name, measured_value, unit, uncertainty, method, conditions. "
55
- "Return as JSON."
56
- # "You are an expert materials scientist. From the attached PDF, extract the material name, "
57
- # "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
58
- # "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
59
- # "For each property, you MUST extract:\n"
60
- # "- property_name\n- value (or range)\n- unit\n"
61
- # "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
62
- # "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
63
- # "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
64
- )
65
-
66
- payload = {
67
- "contents": [
68
- {
69
- "parts": [
70
- {"text": prompt},
71
- {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
72
- ]
73
- }
74
- ],
75
- "generationConfig": {
76
- "temperature": 0,
77
- "responseMimeType": "application/json",
78
- "responseSchema": SCHEMA
79
- }
80
- }
81
-
82
- try:
83
- r = requests.post(API_URL, json=payload, timeout=300)
84
- r.raise_for_status()
85
- data = r.json()
86
-
87
- candidates = data.get("candidates", [])
88
- if not candidates:
89
- return None
90
-
91
- parts = candidates[0].get("content", {}).get("parts", [])
92
- json_text = None
93
- for p in parts:
94
- t = p.get("text", "")
95
- if t.strip().startswith("{"):
96
- json_text = t
97
- break
98
-
99
- return json.loads(json_text) if json_text else None
100
- except Exception as e:
101
- st.error(f"Gemini API Error: {e}")
102
- return None
103
-
104
-
105
- def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
106
- """Convert extracted JSON to DataFrame"""
107
- rows = []
108
- for item in data.get("mechanical_properties", []):
109
- rows.append({
110
- "material_name": data.get("material_name", ""),
111
- "material_abbreviation": data.get("material_abbreviation", ""),
112
- "section": item.get("section", ""),
113
- "property_name": item.get("property_name", ""),
114
- "value": item.get("value", ""),
115
- "unit": item.get("unit", ""),
116
- "english": item.get("english", ""),
117
- "test_condition": item.get("test_condition", ""),
118
- "comments": item.get("comments", "")
119
- })
120
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/Backend/Pdf_ImageExtraction.py DELETED
@@ -1,390 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import math
5
- import tempfile
6
- import fitz # PyMuPDF
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import streamlit as st
11
-
12
- # -------------------
13
- # Config
14
- # -------------------
15
- DPI = 300
16
- OUT_DIR = "outputs"
17
-
18
- KEEP_ONLY_STRESS_STRAIN = False
19
-
20
- CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
21
- SS_KW = re.compile(
22
- r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
23
- re.IGNORECASE
24
- )
25
-
26
- # -------------------
27
- # Render helpers
28
- # -------------------
29
- def render_page(page, dpi=DPI):
30
- mat = fitz.Matrix(dpi/72, dpi/72)
31
- pix = page.get_pixmap(matrix=mat, alpha=False)
32
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
33
- return img, mat
34
-
35
- def pdf_to_px_bbox(bbox_pdf, mat):
36
- x0, y0, x1, y1 = bbox_pdf
37
- sx, sy = mat.a, mat.d
38
- return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
39
-
40
- def safe_crop_px(pil_img, box):
41
- if not isinstance(box, (tuple, list)):
42
- return None
43
- if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
44
- box = box[0]
45
- if len(box) != 4:
46
- return None
47
-
48
- x0, y0, x1, y1 = box
49
- if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
50
- return None
51
-
52
- try:
53
- x0 = int(x0)
54
- y0 = int(y0)
55
- x1 = int(x1)
56
- y1 = int(y1)
57
- except (TypeError, ValueError):
58
- return None
59
-
60
- if x1 < x0:
61
- x0, x1 = x1, x0
62
- if y1 < y0:
63
- y0, y1 = y1, y0
64
-
65
- W, H = pil_img.size
66
- x0 = max(0, min(W, x0))
67
- x1 = max(0, min(W, x1))
68
- y0 = max(0, min(H, y0))
69
- y1 = max(0, min(H, y1))
70
- if x1 <= x0 or y1 <= y0:
71
- return None
72
- return pil_img.crop((x0, y0, x1, y1))
73
-
74
- # -------------------
75
- # Captions
76
- # -------------------
77
- def find_caption_blocks(page):
78
- caps = []
79
- blocks = page.get_text("blocks")
80
- for b in blocks:
81
- x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
82
- t = " ".join(str(text).strip().split())
83
- if CAP_RE.match(t):
84
- caps.append({"bbox": (x0, y0, x1, y1), "text": t})
85
- return caps
86
-
87
- # -------------------
88
- # Dedupe: dHash
89
- # -------------------
90
- def dhash64(pil_img):
91
- gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
92
- pixels = list(gray.getdata())
93
- bits = 0
94
- for r in range(8):
95
- for c in range(8):
96
- left = pixels[r * 9 + c]
97
- right = pixels[r * 9 + c + 1]
98
- bits = (bits << 1) | (1 if left > right else 0)
99
- return bits
100
-
101
- # -------------------
102
- # Rejectors
103
- # -------------------
104
- def has_colorbar_like_strip(pil_img):
105
- img = np.array(pil_img)
106
- if img.ndim != 3:
107
- return False
108
- H, W, _ = img.shape
109
- if W < 250 or H < 150:
110
- return False
111
- strip_w = max(18, int(0.07 * W))
112
- strip = img[:, W-strip_w:W, :]
113
- q = (strip // 24).reshape(-1, 3)
114
- uniq = np.unique(q, axis=0)
115
- return len(uniq) > 70
116
-
117
- def texture_score(pil_img):
118
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
119
- lap = cv2.Laplacian(gray, cv2.CV_64F)
120
- return float(lap.var())
121
-
122
- def is_mostly_legend(pil_img):
123
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
124
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
125
- bw = cv2.medianBlur(bw, 3)
126
- H, W = bw.shape
127
- fill = float(np.count_nonzero(bw)) / float(H * W)
128
- return (0.03 < fill < 0.18) and (min(H, W) < 260)
129
-
130
- # -------------------
131
- # Plot detection
132
- # -------------------
133
- def detect_axes_lines(pil_img):
134
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
135
- edges = cv2.Canny(gray, 50, 150)
136
- H, W = gray.shape
137
- min_len = int(0.28 * min(H, W))
138
-
139
- lines = cv2.HoughLinesP(
140
- edges, 1, np.pi/180,
141
- threshold=90,
142
- minLineLength=min_len,
143
- maxLineGap=14
144
- )
145
- if lines is None:
146
- return None, None
147
-
148
- horizontals, verticals = [], []
149
- for x1, y1, x2, y2 in lines[:, 0]:
150
- dx, dy = abs(x2-x1), abs(y2-y1)
151
- length = math.hypot(dx, dy)
152
- if dy < 18 and dx > 0.35 * W:
153
- horizontals.append((length, (x1, y1, x2, y2)))
154
- if dx < 18 and dy > 0.35 * H:
155
- verticals.append((length, (x1, y1, x2, y2)))
156
-
157
- if not horizontals or not verticals:
158
- return None, None
159
-
160
- horizontals.sort(key=lambda t: t[0], reverse=True)
161
- verticals.sort(key=lambda t: t[0], reverse=True)
162
- return horizontals[0][1], verticals[0][1]
163
-
164
- def axis_intersection_ok(x_axis, y_axis, W, H):
165
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
166
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
167
- if not (0 <= xa_y < H and 0 <= ya_x < W):
168
- return False
169
- if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
170
- return False
171
- return True
172
-
173
- def tick_text_presence_score(pil_img, x_axis, y_axis):
174
- img = np.array(pil_img)
175
- gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
176
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
177
- bw = cv2.medianBlur(bw, 3)
178
-
179
- H, W = gray.shape
180
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
181
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
182
-
183
- y0a = max(0, xa_y - 40)
184
- y1a = min(H, xa_y + 110)
185
- x_roi = bw[y0a:y1a, 0:W]
186
-
187
- x0b = max(0, ya_x - 180)
188
- x1b = min(W, ya_x + 50)
189
- y_roi = bw[0:H, x0b:x1b]
190
-
191
- def count_small_components(mask):
192
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
193
- cnt = 0
194
- for i in range(1, num):
195
- x, y, w, h, area = stats[i]
196
- if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
197
- cnt += 1
198
- return cnt
199
-
200
- return count_small_components(x_roi) + count_small_components(y_roi)
201
-
202
- def is_real_plot(pil_img):
203
- if has_colorbar_like_strip(pil_img):
204
- return False
205
- if is_mostly_legend(pil_img):
206
- return False
207
-
208
- x_axis, y_axis = detect_axes_lines(pil_img)
209
- if x_axis is None or y_axis is None:
210
- return False
211
-
212
- arr = np.array(pil_img)
213
- H, W = arr.shape[0], arr.shape[1]
214
- if not axis_intersection_ok(x_axis, y_axis, W, H):
215
- return False
216
-
217
- if texture_score(pil_img) > 2200:
218
- return False
219
-
220
- score = tick_text_presence_score(pil_img, x_axis, y_axis)
221
- return score >= 18
222
-
223
- # -------------------
224
- # Candidate boxes in a region
225
- # -------------------
226
- def connected_components_boxes(pil_img):
227
- img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
228
- gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
229
- mask = (gray < 245).astype(np.uint8) * 255
230
- mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
231
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
232
-
233
- boxes = []
234
- for i in range(1, num):
235
- x, y, w, h, area = stats[i]
236
- boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
237
- boxes.sort(key=lambda t: t[0], reverse=True)
238
- return boxes
239
-
240
- def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
241
- x0, y0, x1, y1 = box
242
- bw = x1 - x0
243
- bh = y1 - y0
244
- ex0 = max(0, int(x0 - left * bw))
245
- ex1 = min(W, int(x1 + right * bw))
246
- ey0 = max(0, int(y0 - top * bh))
247
- ey1 = min(H, int(y1 + bottom * bh))
248
- return (ex0, ey0, ex1, ey1)
249
-
250
- # -------------------
251
- # Crop plot from caption
252
- # -------------------
253
- def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
254
- cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
255
- cap_y0 = cap_px[1]
256
- cap_y1 = cap_px[3]
257
-
258
- W, H = page_img.size
259
- search_top = max(0, cap_y0 - int(0.95 * H))
260
- search_bot = min(H, cap_y1 + int(0.20 * H))
261
- region = safe_crop_px(page_img, (0, search_top, W, search_bot))
262
- if region is None:
263
- return None
264
-
265
- comps = connected_components_boxes(region)
266
- best = None
267
- best_area = -1
268
-
269
- for area, box in comps[:35]:
270
- x0, y0, x1, y1 = box
271
- bw = x1 - x0
272
- bh = y1 - y0
273
- if bw < 220 or bh < 180:
274
- continue
275
-
276
- exp = expand_box(box, region.size[0], region.size[1])
277
- cand = safe_crop_px(region, exp)
278
- if cand is None:
279
- continue
280
-
281
- if not is_real_plot(cand):
282
- continue
283
-
284
- if area > best_area:
285
- best_area = area
286
- best = cand
287
-
288
- return best
289
-
290
- # -------------------
291
- # Streamlit UI
292
- # -------------------
293
- def run_extraction(pdf_path, paper_id="uploaded_paper"):
294
- out_paper = os.path.join(OUT_DIR, paper_id)
295
- out_imgs = os.path.join(out_paper, "plots_with_axes")
296
- os.makedirs(out_imgs, exist_ok=True)
297
-
298
- doc = fitz.open(pdf_path)
299
- results = []
300
- seen = set()
301
- saved = 0
302
-
303
- for p in range(len(doc)):
304
- page = doc[p]
305
- caps = find_caption_blocks(page)
306
- if not caps:
307
- continue
308
-
309
- page_img, mat = render_page(page, dpi=DPI)
310
-
311
- for cap in caps:
312
- cap_text = cap["text"]
313
-
314
- if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
315
- continue
316
-
317
- fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
318
- if fig is None:
319
- continue
320
-
321
- if fig.size[0] > 8 and fig.size[1] > 8:
322
- fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
323
-
324
- try:
325
- h = dhash64(fig)
326
- except Exception:
327
- continue
328
-
329
- if h in seen:
330
- continue
331
- seen.add(h)
332
-
333
- img_name = f"p{p+1:02d}_{saved:04d}.png"
334
- img_path = os.path.join(out_imgs, img_name)
335
- fig.save(img_path)
336
-
337
- results.append({
338
- "page": p + 1,
339
- "caption": cap_text,
340
- "image": img_path
341
- })
342
- saved += 1
343
-
344
- out_json = os.path.join(out_paper, "plots_with_axes.json")
345
- with open(out_json, "w", encoding="utf-8") as f:
346
- json.dump(results, f, indent=2, ensure_ascii=False)
347
-
348
- return results, out_json
349
-
350
- def main():
351
- st.set_page_config(page_title="Research Paper Plot Extractor", layout="wide")
352
- st.title(" Plot Extractor (Upload PDF)")
353
-
354
- uploaded = st.file_uploader("Upload a research paper PDF", type=["pdf"])
355
- if not uploaded:
356
- st.info("Upload a PDF to extract plots.")
357
- return
358
-
359
- paper_id = os.path.splitext(uploaded.name)[0].replace(" ", "_")
360
-
361
- with tempfile.TemporaryDirectory() as tmpdir:
362
- pdf_path = os.path.join(tmpdir, uploaded.name)
363
- with open(pdf_path, "wb") as f:
364
- f.write(uploaded.read())
365
-
366
- with st.spinner("Extracting plots..."):
367
- results, out_json = run_extraction(pdf_path, paper_id=paper_id)
368
-
369
- st.success(f"Extracted {len(results)} plots.")
370
-
371
- # Show images + captions
372
- for r in results:
373
- st.markdown(f"**Page {r['page']}** — {r['caption']}")
374
- st.image(r["image"], use_container_width=True)
375
- st.divider()
376
-
377
- # JSON viewer + download
378
- st.subheader("JSON Output")
379
- st.json(results)
380
-
381
- with open(out_json, "rb") as f:
382
- st.download_button(
383
- "Download JSON",
384
- data=f,
385
- file_name=os.path.basename(out_json),
386
- mime="application/json"
387
- )
388
-
389
- if __name__ == "__main__":
390
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/ESS-min.jpg DELETED

Git LFS Details

  • SHA256: ff58c9304c39dc90ca15b516a1f1ec385ea60a9829c5dd9eb698ee1f82778eb7
  • Pointer size: 131 Bytes
  • Size of remote file: 356 kB
src/pages/pages/categorized/Temp_Backup.py DELETED
@@ -1,736 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import math
5
- import tempfile
6
- import fitz # PyMuPDF
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import streamlit as st
11
- import pandas as pd
12
- import requests
13
- import base64
14
- from typing import Dict, Any, Optional
15
-
16
- API_KEY = "[REDACTED-GOOGLE-API-KEY]"  # SECURITY(review): a live Google API key was committed here — revoke/rotate it and load it from an environment variable instead
17
- API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
18
-
19
- SCHEMA = {
20
- "type": "OBJECT",
21
- "properties": {
22
- "material_name": {"type": "STRING"},
23
- "material_abbreviation": {"type": "STRING"},
24
- "mechanical_properties": {
25
- "type": "ARRAY",
26
- "items": {
27
- "type": "OBJECT",
28
- "properties": {
29
- "section": {"type": "STRING"},
30
- "property_name": {"type": "STRING"},
31
- "value": {"type": "STRING"},
32
- "unit": {"type": "STRING"},
33
- "english": {"type": "STRING"},
34
- "test_condition": {"type": "STRING"},
35
- "comments": {"type": "STRING"}
36
- },
37
- "required": ["section", "property_name", "value", "english", "comments"]
38
- }
39
- }
40
- }
41
- }
42
- def make_abbreviation(name: str) -> str:
43
- """Create a simple abbreviation from the material name."""
44
- if not name:
45
- return "UNKNOWN"
46
- words = name.split()
47
- abbr = "".join(w[0] for w in words if w and w[0].isalpha()).upper()
48
- return abbr or name[:6].upper()
49
-
50
- DPI = 300
51
- OUT_DIR = "outputs"
52
- KEEP_ONLY_STRESS_STRAIN = False
53
- CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
54
- SS_KW = re.compile(
55
- r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
56
- re.IGNORECASE
57
- )
58
-
59
- def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
60
- """Calls Gemini API with PDF bytes"""
61
- try:
62
- encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
63
- mime_type = "application/pdf"
64
- except Exception as e:
65
- st.error(f"Error encoding PDF: {e}")
66
- return None
67
-
68
- prompt = (
69
- "You are an expert materials scientist. From the attached PDF, extract the material name, "
70
- "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
71
- "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
72
- "For each property, you MUST extract:\n"
73
- "- property_name\n- value (or range)\n- unit\n"
74
- "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
75
- "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
76
- "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
77
- )
78
-
79
- payload = {
80
- "contents": [{
81
- "parts": [
82
- {"text": prompt},
83
- {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
84
- ]
85
- }],
86
- "generationConfig": {
87
- "temperature": 0,
88
- "responseMimeType": "application/json",
89
- "responseSchema": SCHEMA
90
- }
91
- }
92
-
93
- try:
94
- r = requests.post(API_URL, json=payload, timeout=300)
95
- r.raise_for_status()
96
- data = r.json()
97
-
98
- candidates = data.get("candidates", [])
99
- if not candidates:
100
- return None
101
-
102
- parts = candidates[0].get("content", {}).get("parts", [])
103
- json_text = None
104
- for p in parts:
105
- t = p.get("text", "")
106
- if t.strip().startswith("{"):
107
- json_text = t
108
- break
109
-
110
- return json.loads(json_text) if json_text else None
111
- except Exception as e:
112
- st.error(f"Gemini API Error: {e}")
113
- return None
114
-
115
- # def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
116
- # """Convert extracted JSON to DataFrame"""
117
- # rows = []
118
- # for item in data.get("mechanical_properties", []):
119
- # rows.append({
120
- # "material_name": data.get("material_name", ""),
121
- # "material_abbreviation": data.get("material_abbreviation", ""),
122
- # "section": item.get("section", ""),
123
- # "property_name": item.get("property_name", ""),
124
- # "value": item.get("value", ""),
125
- # "unit": item.get("unit", ""),
126
- # "english": item.get("english", ""),
127
- # "test_condition": item.get("test_condition", ""),
128
- # "comments": item.get("comments", "")
129
- # })
130
- # return pd.DataFrame(rows)
131
- def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
132
- """Convert extracted JSON to DataFrame, ensuring abbreviation is not empty."""
133
- mat_name = data.get("material_name", "") or ""
134
- mat_abbr = data.get("material_abbreviation", "") or ""
135
-
136
- if not mat_abbr:
137
- mat_abbr = make_abbreviation(mat_name)
138
-
139
- rows = []
140
- for item in data.get("mechanical_properties", []):
141
- rows.append({
142
- "material_name": mat_name,
143
- "material_abbreviation": mat_abbr,
144
- "section": item.get("section", "") or "Mechanical",
145
- "property_name": item.get("property_name", "") or "Unknown property",
146
- "value": item.get("value", "") or "N/A",
147
- "unit": item.get("unit", "") or "",
148
- "english": item.get("english", "") or "",
149
- "test_condition": item.get("test_condition", "") or "",
150
- "comments": item.get("comments", "") or "",
151
- })
152
- return pd.DataFrame(rows)
153
-
154
- def render_page(page, dpi=DPI):
155
- mat = fitz.Matrix(dpi/72, dpi/72)
156
- pix = page.get_pixmap(matrix=mat, alpha=False)
157
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
158
- return img, mat
159
-
160
- def pdf_to_px_bbox(bbox_pdf, mat):
161
- x0, y0, x1, y1 = bbox_pdf
162
- sx, sy = mat.a, mat.d
163
- return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
164
-
165
- def safe_crop_px(pil_img, box):
166
- if not isinstance(box, (tuple, list)):
167
- return None
168
- if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
169
- box = box[0]
170
- if len(box) != 4:
171
- return None
172
-
173
- x0, y0, x1, y1 = box
174
- if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
175
- return None
176
-
177
- try:
178
- x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
179
- except (TypeError, ValueError):
180
- return None
181
-
182
- if x1 < x0: x0, x1 = x1, x0
183
- if y1 < y0: y0, y1 = y1, y0
184
-
185
- W, H = pil_img.size
186
- x0 = max(0, min(W, x0))
187
- x1 = max(0, min(W, x1))
188
- y0 = max(0, min(H, y0))
189
- y1 = max(0, min(H, y1))
190
- if x1 <= x0 or y1 <= y0:
191
- return None
192
- return pil_img.crop((x0, y0, x1, y1))
193
-
194
- def find_caption_blocks(page):
195
- caps = []
196
- blocks = page.get_text("blocks")
197
- for b in blocks:
198
- x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
199
- t = " ".join(str(text).strip().split())
200
- if CAP_RE.match(t):
201
- caps.append({"bbox": (x0, y0, x1, y1), "text": t})
202
- return caps
203
-
204
- def dhash64(pil_img):
205
- gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
206
- pixels = list(gray.getdata())
207
- bits = 0
208
- for r in range(8):
209
- for c in range(8):
210
- left = pixels[r * 9 + c]
211
- right = pixels[r * 9 + c + 1]
212
- bits = (bits << 1) | (1 if left > right else 0)
213
- return bits
214
-
215
- def has_colorbar_like_strip(pil_img):
216
- img = np.array(pil_img)
217
- if img.ndim != 3:
218
- return False
219
- H, W, _ = img.shape
220
- if W < 250 or H < 150:
221
- return False
222
- strip_w = max(18, int(0.07 * W))
223
- strip = img[:, W-strip_w:W, :]
224
- q = (strip // 24).reshape(-1, 3)
225
- uniq = np.unique(q, axis=0)
226
- return len(uniq) > 70
227
-
228
- def texture_score(pil_img):
229
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
230
- lap = cv2.Laplacian(gray, cv2.CV_64F)
231
- return float(lap.var())
232
-
233
- def is_mostly_legend(pil_img):
234
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
235
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
236
- bw = cv2.medianBlur(bw, 3)
237
- H, W = bw.shape
238
- fill = float(np.count_nonzero(bw)) / float(H * W)
239
- return (0.03 < fill < 0.18) and (min(H, W) < 260)
240
-
241
- def detect_axes_lines(pil_img):
242
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
243
- edges = cv2.Canny(gray, 50, 150)
244
- H, W = gray.shape
245
- min_len = int(0.28 * min(H, W))
246
-
247
- lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=90, minLineLength=min_len, maxLineGap=14)
248
- if lines is None:
249
- return None, None
250
-
251
- horizontals, verticals = [], []
252
- for x1, y1, x2, y2 in lines[:, 0]:
253
- dx, dy = abs(x2-x1), abs(y2-y1)
254
- length = math.hypot(dx, dy)
255
- if dy < 18 and dx > 0.35 * W:
256
- horizontals.append((length, (x1, y1, x2, y2)))
257
- if dx < 18 and dy > 0.35 * H:
258
- verticals.append((length, (x1, y1, x2, y2)))
259
-
260
- if not horizontals or not verticals:
261
- return None, None
262
-
263
- horizontals.sort(key=lambda t: t[0], reverse=True)
264
- verticals.sort(key=lambda t: t[0], reverse=True)
265
- return horizontals[0][1], verticals[0][1]
266
-
267
- def axis_intersection_ok(x_axis, y_axis, W, H):
268
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
269
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
270
- if not (0 <= xa_y < H and 0 <= ya_x < W):
271
- return False
272
- if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
273
- return False
274
- return True
275
-
276
- def tick_text_presence_score(pil_img, x_axis, y_axis):
277
- img = np.array(pil_img)
278
- gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
279
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
280
- bw = cv2.medianBlur(bw, 3)
281
-
282
- H, W = gray.shape
283
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
284
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
285
-
286
- y0a = max(0, xa_y - 40)
287
- y1a = min(H, xa_y + 110)
288
- x_roi = bw[y0a:y1a, 0:W]
289
-
290
- x0b = max(0, ya_x - 180)
291
- x1b = min(W, ya_x + 50)
292
- y_roi = bw[0:H, x0b:x1b]
293
-
294
- def count_small_components(mask):
295
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
296
- cnt = 0
297
- for i in range(1, num):
298
- x, y, w, h, area = stats[i]
299
- if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
300
- cnt += 1
301
- return cnt
302
-
303
- return count_small_components(x_roi) + count_small_components(y_roi)
304
-
305
- def is_real_plot(pil_img):
306
- if has_colorbar_like_strip(pil_img):
307
- return False
308
- if is_mostly_legend(pil_img):
309
- return False
310
-
311
- x_axis, y_axis = detect_axes_lines(pil_img)
312
- if x_axis is None or y_axis is None:
313
- return False
314
-
315
- arr = np.array(pil_img)
316
- H, W = arr.shape[0], arr.shape[1]
317
- if not axis_intersection_ok(x_axis, y_axis, W, H):
318
- return False
319
-
320
- if texture_score(pil_img) > 2200:
321
- return False
322
-
323
- score = tick_text_presence_score(pil_img, x_axis, y_axis)
324
- return score >= 18
325
-
326
- def connected_components_boxes(pil_img):
327
- img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
328
- gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
329
- mask = (gray < 245).astype(np.uint8) * 255
330
- mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
331
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
332
-
333
- boxes = []
334
- for i in range(1, num):
335
- x, y, w, h, area = stats[i]
336
- boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
337
- boxes.sort(key=lambda t: t[0], reverse=True)
338
- return boxes
339
-
340
- def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
341
- x0, y0, x1, y1 = box
342
- bw = x1 - x0
343
- bh = y1 - y0
344
- ex0 = max(0, int(x0 - left * bw))
345
- ex1 = min(W, int(x1 + right * bw))
346
- ey0 = max(0, int(y0 - top * bh))
347
- ey1 = min(H, int(y1 + bottom * bh))
348
- return (ex0, ey0, ex1, ey1)
349
-
350
- def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
351
- cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
352
- cap_y0 = cap_px[1]
353
- cap_y1 = cap_px[3]
354
-
355
- W, H = page_img.size
356
- search_top = max(0, cap_y0 - int(0.95 * H))
357
- search_bot = min(H, cap_y1 + int(0.20 * H))
358
- region = safe_crop_px(page_img, (0, search_top, W, search_bot))
359
- if region is None:
360
- return None
361
-
362
- comps = connected_components_boxes(region)
363
- best = None
364
- best_area = -1
365
-
366
- for area, box in comps[:35]:
367
- x0, y0, x1, y1 = box
368
- bw = x1 - x0
369
- bh = y1 - y0
370
- if bw < 220 or bh < 180:
371
- continue
372
-
373
- exp = expand_box(box, region.size[0], region.size[1])
374
- cand = safe_crop_px(region, exp)
375
- if cand is None:
376
- continue
377
-
378
- if not is_real_plot(cand):
379
- continue
380
-
381
- if area > best_area:
382
- best_area = area
383
- best = cand
384
-
385
- return best
386
-
387
- def extract_images(pdf_path, paper_id="uploaded_paper"):
388
- """Extract plot images from PDF"""
389
- out_paper = os.path.join(OUT_DIR, paper_id)
390
- out_imgs = os.path.join(out_paper, "plots_with_axes")
391
- os.makedirs(out_imgs, exist_ok=True)
392
-
393
- doc = fitz.open(pdf_path)
394
- results = []
395
- seen = set()
396
- saved = 0
397
-
398
- for p in range(len(doc)):
399
- page = doc[p]
400
- caps = find_caption_blocks(page)
401
- if not caps:
402
- continue
403
-
404
- page_img, mat = render_page(page, dpi=DPI)
405
-
406
- for cap in caps:
407
- cap_text = cap["text"]
408
-
409
- if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
410
- continue
411
-
412
- fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
413
- if fig is None:
414
- continue
415
-
416
- if fig.size[0] > 8 and fig.size[1] > 8:
417
- fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
418
-
419
- try:
420
- h = dhash64(fig)
421
- except Exception:
422
- continue
423
-
424
- if h in seen:
425
- continue
426
- seen.add(h)
427
-
428
- img_name = f"p{p+1:02d}_{saved:04d}.png"
429
- img_path = os.path.join(out_imgs, img_name)
430
- fig.save(img_path)
431
-
432
- results.append({
433
- "page": p + 1,
434
- "caption": cap_text,
435
- "image": img_path
436
- })
437
- saved += 1
438
-
439
- return results
440
-
441
- def input_form():
442
- PROPERTY_CATEGORIES = {
443
- "Polymer": [
444
- "Thermal",
445
- "Mechanical",
446
- "Processing",
447
- "Physical",
448
- "Descriptive",
449
- ],
450
- "Fiber": [
451
- "Mechanical",
452
- "Physical",
453
- "Thermal",
454
- "Descriptive",
455
- ],
456
- "Composite": [
457
- "Mechanical",
458
- "Thermal",
459
- "Processing",
460
- "Physical",
461
- "Descriptive",
462
- "Composition / Reinforcement",
463
- "Architecture / Structure",
464
- ],
465
- }
466
-
467
- PROPERTY_NAMES = {
468
- "Polymer": {
469
- "Thermal": [
470
- "Glass transition temperature (Tg)",
471
- "Melting temperature (Tm)",
472
- "Crystallization temperature (Tc)",
473
- "Degree of crystallinity",
474
- "Decomposition temperature",
475
- ],
476
- "Mechanical": [
477
- "Tensile modulus",
478
- "Tensile strength",
479
- "Elongation at break",
480
- "Flexural modulus",
481
- "Impact strength",
482
- ],
483
- "Processing": [
484
- "Melt flow index (MFI)",
485
- "Processing temperature",
486
- "Cooling rate",
487
- "Mold shrinkage",
488
- ],
489
- "Physical": [
490
- "Density",
491
- "Specific gravity",
492
- ],
493
- "Descriptive": [
494
- "Material grade",
495
- "Manufacturer",
496
- ],
497
- },
498
-
499
- "Fiber": {
500
- "Mechanical": [
501
- "Tensile modulus",
502
- "Tensile strength",
503
- "Strain to failure",
504
- ],
505
- "Physical": [
506
- "Density",
507
- "Fiber diameter",
508
- ],
509
- "Thermal": [
510
- "Decomposition temperature",
511
- ],
512
- "Descriptive": [
513
- "Fiber type",
514
- "Surface treatment",
515
- ],
516
- },
517
-
518
- "Composite": {
519
- "Mechanical": [
520
- "Longitudinal modulus (E1)",
521
- "Transverse modulus (E2)",
522
- "Shear modulus (G12)",
523
- "Poissons ratio (V12)",
524
- "Tensile strength (fiber direction)",
525
- "Interlaminar shear strength",
526
- ],
527
- "Thermal": [
528
- "Glass transition temperature (matrix)",
529
- "Coefficient of thermal expansion (CTE)",
530
- ],
531
- "Processing": [
532
- "Curing temperature",
533
- "Curing pressure",
534
- ],
535
- "Physical": [
536
- "Density",
537
- ],
538
- "Descriptive": [
539
- "Laminate type",
540
- ],
541
- "Composition / Reinforcement": [
542
- "Fiber volume fraction",
543
- "Fiber weight fraction",
544
- "Fiber type",
545
- "Matrix type",
546
- ],
547
- "Architecture / Structure": [
548
- "Weave type",
549
- "Ply orientation",
550
- "Number of plies",
551
- "Stacking sequence",
552
- ],
553
- },
554
- }
555
-
556
-
557
-
558
- st.title("Materials Property Input Form")
559
-
560
- material_class = st.selectbox(
561
- "Select Material Class",
562
- ("Polymer", "Fiber", "Composite"),
563
- index=None,
564
- placeholder="Choose material class",
565
- )
566
-
567
- if material_class:
568
- property_category = st.selectbox(
569
- "Select Property Category",
570
- PROPERTY_CATEGORIES[material_class],
571
- index=None,
572
- placeholder="Choose property category",
573
- )
574
- else:
575
- property_category = None
576
-
577
- if material_class and property_category:
578
- property_name = st.selectbox(
579
- "Select Property",
580
- PROPERTY_NAMES[material_class][property_category],
581
- index=None,
582
- placeholder="Choose property",
583
- )
584
- else:
585
- property_name = None
586
-
587
- if material_class and property_category and property_name:
588
- with st.form("user_input"):
589
- st.subheader("Enter Data")
590
-
591
- material_name = st.text_input("Material Name")
592
- material_abbr = st.text_input("Material Abbreviation")
593
-
594
- value = st.text_input("Value")
595
- unit = st.text_input("Unit (SI)")
596
- english = st.text_input("English Units")
597
- test_condition = st.text_input("Test Condition")
598
- comments = st.text_area("Comments")
599
-
600
- submitted = st.form_submit_button("Submit")
601
-
602
- if submitted:
603
- if not (material_name and value):
604
- st.error("Material name and value are required.")
605
- else:
606
- Input_db = pd.DataFrame([{
607
- "material_class": material_class,
608
- "material_name": material_name,
609
- "material_abbreviation": material_abbr,
610
- "section": property_category,
611
- "property_name": property_name,
612
- "value": value,
613
- "unit": unit,
614
- "english_units": english,
615
- "test_condition": test_condition,
616
- "comments": comments
617
- }])
618
-
619
- st.success("Property added successfully")
620
- st.dataframe(Input_db)
621
-
622
-
623
- if "user_uploaded_data" not in st.session_state:
624
- st.session_state["user_uploaded_data"] = Input_db
625
- else:
626
- st.session_state["user_uploaded_data"] = pd.concat(
627
- [st.session_state["user_uploaded_data"], Input_db],
628
- ignore_index=True
629
- )
630
- def main():
631
- input_form()
632
- st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")
633
- st.title("PDF Material Data & Plot Extractor")
634
-
635
- uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
636
-
637
- if not uploaded_file:
638
- st.info("Upload a PDF to extract material data and plots")
639
- return
640
-
641
- paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")
642
-
643
- tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
644
-
645
- with tempfile.TemporaryDirectory() as tmpdir:
646
- pdf_path = os.path.join(tmpdir, uploaded_file.name)
647
- with open(pdf_path, "wb") as f:
648
- f.write(uploaded_file.getbuffer())
649
-
650
- with tab1:
651
- st.subheader("Material Properties Data")
652
-
653
- with st.spinner(" Extracting material data..."):
654
- with open(pdf_path, "rb") as f:
655
- pdf_bytes = f.read()
656
-
657
- data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
658
-
659
- if data:
660
- df = convert_to_dataframe(data)
661
-
662
- if not df.empty:
663
- st.success(f"Extracted {len(df)} properties")
664
-
665
- col1, col2 = st.columns(2)
666
- with col1:
667
- st.metric("Material", data.get("material_name", "N/A"))
668
- with col2:
669
- st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))
670
-
671
- st.dataframe(df, use_container_width=True, height=400)
672
- st.subheader("Assign Material Category")
673
-
674
- extracted_material_class = st.selectbox(
675
- "Select category for this material",
676
- ["Polymer", "Fiber", "Composite"],
677
- index=None,
678
- placeholder="Required before adding to database"
679
- )
680
- if st.button(" Add to Database"):
681
- if not extracted_material_class:
682
- st.error("Please select a material category before adding.")
683
- else:
684
- df["material_class"] = extracted_material_class
685
-
686
- if "user_uploaded_data" not in st.session_state:
687
- st.session_state["user_uploaded_data"] = df
688
- else:
689
- st.session_state["user_uploaded_data"] = pd.concat(
690
- [st.session_state["user_uploaded_data"], df],
691
- ignore_index=True
692
- )
693
-
694
- st.success(f"Added to {extracted_material_class} database!")
695
-
696
- # if st.button(" Add to Database"):
697
- # if "user_uploaded_data" not in st.session_state:
698
- # st.session_state["user_uploaded_data"] = df
699
- # else:
700
- # st.session_state["user_uploaded_data"] = pd.concat(
701
- # [st.session_state["user_uploaded_data"], df],
702
- # ignore_index=True
703
- # )
704
- # st.success("Added to database!")
705
-
706
- csv = df.to_csv(index=False)
707
- st.download_button(
708
- "Download CSV",
709
- data=csv,
710
- file_name=f"{paper_id}_data.csv",
711
- mime="text/csv"
712
- )
713
- else:
714
- st.warning("No data extracted")
715
- else:
716
- st.error("Failed to extract data from PDF")
717
-
718
- with tab2:
719
- st.subheader("Extracted Plot Images")
720
-
721
- with st.spinner(" Extracting plots from PDF..."):
722
- image_results = extract_images(pdf_path, paper_id=paper_id)
723
-
724
- if image_results:
725
- st.success(f" Extracted {len(image_results)} plots")
726
-
727
- for r in image_results:
728
- st.markdown(f"**Page {r['page']}** — {r['caption']}")
729
- st.image(r["image"], use_container_width=True)
730
- st.divider()
731
- else:
732
- st.warning("No plots found in PDF")
733
-
734
-
735
- if __name__ == "__main__":
736
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/__pycache__/page1.cpython-312.pyc DELETED
Binary file (4.86 kB)
 
src/pages/pages/categorized/__pycache__/page1.cpython-313.pyc DELETED
Binary file (4.94 kB)
 
src/pages/pages/categorized/__pycache__/page1.cpython-314.pyc DELETED
Binary file (9.83 kB)
 
src/pages/pages/categorized/__pycache__/page2.cpython-312.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page2.cpython-313.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page2.cpython-314.pyc DELETED
Binary file (672 Bytes)
 
src/pages/pages/categorized/__pycache__/page3.cpython-313.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page3.cpython-314.pyc DELETED
Binary file (2.93 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc DELETED
Binary file (34 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc.2029864538672 DELETED
Binary file (8.01 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc.2097035857760 DELETED
Binary file (1.22 kB)
 
src/pages/pages/categorized/page1.py DELETED
@@ -1,307 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from PIL import Image
4
- import re
5
-
6
- def extract_matrix_fiber_from_abbr(abbr: str):
7
- if not isinstance(abbr, str):
8
- return None, None
9
-
10
- text = abbr.lower()
11
-
12
- matrix_map = {
13
- "epoxy": "Epoxy",
14
- "cyanate ester": "Cyanate Ester",
15
- "cynate ester": "Cyanate Ester",
16
- "polypropylene": "Polypropylene",
17
- "pp": "Polypropylene",
18
- "peek": "PEEK",
19
- "pei": "PEI",
20
- "nylon": "Nylon",
21
- "pa6": "PA6",
22
- "polyester": "Polyester",
23
- "vinyl ester": "Vinyl Ester",
24
- "phenolic": "Phenolic"
25
- }
26
-
27
- matrix = None
28
- for key, val in matrix_map.items():
29
- if key in text:
30
- matrix = val
31
- break
32
-
33
- fiber_map = {
34
- "carbon": "Carbon Fiber",
35
- "glass": "Glass Fiber",
36
- "e-glass": "E-Glass Fiber",
37
- "s-glass": "S-Glass Fiber",
38
- "aramid": "Aramid Fiber",
39
- "kevlar": "Kevlar Fiber",
40
- "basalt": "Basalt Fiber",
41
- "natural": "Natural Fiber"
42
- }
43
-
44
- fiber = None
45
- for key, val in fiber_map.items():
46
- if key in text:
47
- fiber = val
48
- break
49
-
50
- return matrix, fiber
51
-
52
-
53
- def main():
54
- st.set_page_config(layout="wide")
55
-
56
- mat_section = st.sidebar.expander("Materials", expanded=False)
57
- with mat_section:
58
- thermo = mat_section.button("Composites")
59
- polymers = mat_section.button("Polymers")
60
- Fibers = mat_section.button("Fibers")
61
-
62
- if "material_type" not in st.session_state:
63
- st.session_state.material_type = "Composites"
64
-
65
- if thermo:
66
- st.session_state.material_type = "Composites"
67
- elif polymers:
68
- st.session_state.material_type = "Polymers"
69
- elif Fibers:
70
- st.session_state.material_type = "Fibers"
71
-
72
- @st.cache_data
73
- def load_data(material_type):
74
- file_map = {
75
- "Composites": "src/data/data/Composites_material_data.csv",
76
- "Polymers": "src/data/data/polymers_material_data.csv",
77
- "Fibers": "src/data/data/Fibers_material_data.csv",
78
- }
79
- return pd.read_csv(file_map[material_type])
80
-
81
- csv_data = load_data(st.session_state.material_type)
82
-
83
- # if "user_uploaded_data" in st.session_state:
84
- # df = pd.concat([csv_data, st.session_state["user_uploaded_data"]], ignore_index=True)
85
- # else:
86
- # df = csv_data
87
- # Normalize naming between pages
88
- CLASS_MAP = {
89
- "Polymers": "Polymer",
90
- "Fibers": "Fiber",
91
- "Composites": "Composite",
92
- }
93
-
94
- current_class = CLASS_MAP[st.session_state.material_type]
95
-
96
- if "user_uploaded_data" in st.session_state:
97
- user_df = st.session_state["user_uploaded_data"]
98
-
99
- filtered_user_df = user_df[
100
- user_df["material_class"] == current_class
101
- ]
102
-
103
- df = pd.concat([csv_data, filtered_user_df], ignore_index=True)
104
- else:
105
- df = csv_data
106
-
107
-
108
- st.session_state["base_data"] = df
109
-
110
- st.title("Materials DataSet")
111
-
112
- materials_df = (
113
- df[["material_abbreviation", "material_name"]]
114
- .fillna("")
115
- .drop_duplicates()
116
- .reset_index(drop=True)
117
- )
118
-
119
- materials_df[["Matrix", "Fiber"]] = materials_df["material_abbreviation"].apply(
120
- lambda x: pd.Series(extract_matrix_fiber_from_abbr(x))
121
- )
122
-
123
-
124
- col1, col2 = st.columns(2, vertical_alignment="center")
125
-
126
- # st.subheader("Filter Composites")
127
-
128
- # matrix_options = sorted(
129
- # materials_df["Matrix"].dropna().unique()
130
- # )
131
-
132
- # fiber_options = sorted(
133
- # materials_df["Fiber"].dropna().unique()
134
- # )
135
-
136
- # fcol1, fcol2 = st.columns(2)
137
-
138
- # with fcol1:
139
- # selected_matrix = st.selectbox(
140
- # "Matrix Material",
141
- # ["All"] + matrix_options
142
- # )
143
-
144
- # with fcol2:
145
- # selected_fiber = st.selectbox(
146
- # "Fiber Material",
147
- # ["All"] + fiber_options
148
- # )
149
-
150
-
151
- # filtered_materials_df = materials_df.copy()
152
-
153
- # if selected_matrix != "All":
154
- # filtered_materials_df = filtered_materials_df[
155
- # filtered_materials_df["Matrix"] == selected_matrix
156
- # ]
157
-
158
- # if selected_fiber != "All":
159
- # filtered_materials_df = filtered_materials_df[
160
- # filtered_materials_df["Fiber"] == selected_fiber
161
- # ]
162
-
163
-
164
- with col1:
165
- st.write("Filter Composites")
166
-
167
- selected_matrix = "All"
168
- selected_fiber = "All"
169
-
170
- if st.session_state.material_type == "Composites":
171
-
172
-
173
- matrix_options = sorted(
174
- materials_df["Matrix"].dropna().unique()
175
- )
176
-
177
- fiber_options = sorted(
178
- materials_df["Fiber"].dropna().unique()
179
- )
180
-
181
- fcol1, fcol2 = st.columns(2)
182
-
183
- with fcol1:
184
- selected_matrix = st.selectbox(
185
- "Matrix Material",
186
- ["All"] + matrix_options
187
- )
188
-
189
- with fcol2:
190
- selected_fiber = st.selectbox(
191
- "Fiber Material",
192
- ["All"] + fiber_options
193
- )
194
-
195
-
196
-
197
- filtered_materials_df = materials_df.copy()
198
-
199
- if st.session_state.material_type == "Composites":
200
- if selected_matrix != "All":
201
- filtered_materials_df = filtered_materials_df[
202
- filtered_materials_df["Matrix"] == selected_matrix
203
- ]
204
-
205
- if selected_fiber != "All":
206
- filtered_materials_df = filtered_materials_df[
207
- filtered_materials_df["Fiber"] == selected_fiber
208
- ]
209
-
210
- st.write("Select Material")
211
- st.dataframe(
212
- filtered_materials_df,
213
- key="material_table",
214
- selection_mode="single-cell",
215
- on_select="rerun",
216
- use_container_width=True,
217
- height=260
218
- )
219
-
220
- def get_selected_value(df, key, column_name):
221
- if key in st.session_state:
222
- sel = st.session_state[key]["selection"]["cells"]
223
- if sel:
224
- row_idx = sel[0][0]
225
- return df.iloc[row_idx][column_name]
226
- return None
227
-
228
-
229
- mat = get_selected_value(materials_df, "material_table", "material_abbreviation")
230
-
231
- with col2:
232
- st.write("Select Property")
233
-
234
- if mat:
235
- filtered_df = df[
236
- (df["material_abbreviation"] == mat) &
237
- (df["value"].notna()) &
238
- (df["property_name"].notna())
239
- ]
240
- property_sel = st.selectbox(
241
- "Type of Property",
242
- filtered_df["section"].drop_duplicates()
243
- )
244
-
245
- properties_df = (
246
- filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
247
- .drop_duplicates()
248
- .reset_index(drop=True)
249
- )
250
- else:
251
- filtered_df = df[df["value"].notna() & df["property_name"].notna()]
252
- property_sel = st.selectbox(
253
- "Type of Property",
254
- filtered_df["section"].drop_duplicates()
255
- )
256
-
257
- properties_df = (
258
- filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
259
- .drop_duplicates()
260
- .reset_index(drop=True)
261
- )
262
-
263
- st.dataframe(
264
- properties_df,
265
- key="property_table",
266
- selection_mode="single-cell",
267
- on_select="rerun",
268
- use_container_width=True,
269
- height=260
270
- )
271
-
272
- prop = get_selected_value(properties_df, "property_table", "property_name")
273
-
274
- st.write("")
275
- if st.button("Search", disabled=not (mat and prop)):
276
- st.write(f"**Material:** {mat}")
277
- st.write(f"**Property:** {prop}")
278
-
279
- result = df[
280
- (df["material_abbreviation"] == mat) &
281
- (df["property_name"] == prop) &
282
- (df["value"].notna())
283
- ]
284
-
285
- if not result.empty:
286
- st.subheader("Property Data")
287
- st.dataframe(result.T, use_container_width=True)
288
-
289
- st.subheader("Property Graph")
290
- img_path = f"src/images/images/{mat}_{prop}.png"
291
-
292
- try:
293
- img = Image.open(img_path)
294
- st.image(img, use_container_width=True, caption="Stress strain curve")
295
- except FileNotFoundError:
296
- st.write("")
297
- # fallback_img = Image.open("src/pages/pages/categorized/ESS-min.jpg")
298
- # st.image(fallback_img, use_container_width=True, caption="Stress strain curve")
299
-
300
- else:
301
- st.warning("No data found for this material-property combination")
302
-
303
-
304
-
305
-
306
-
307
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page2.py DELETED
@@ -1,265 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import os
4
- from PIL import Image
5
- import boto3
6
- import tabula
7
- import faiss
8
- import json
9
- import base64
10
- import pymupdf
11
- import requests
12
- import os
13
- import logging
14
- import numpy as np
15
- import warnings
16
- from tqdm import tqdm
17
- from botocore.exceptions import ClientError
18
- from langchain_text_splitters import RecursiveCharacterTextSplitter
19
- from IPython import display
20
- from langchain_aws import ChatBedrock
21
-
22
-
23
- from pathlib import Path
24
-
25
- def main():
26
-
27
-
28
-
29
-
30
- logger = logging.getLogger(__name__)
31
- logger.setLevel(logging.ERROR)
32
-
33
- warnings.filterwarnings("ignore")
34
-
35
- def create_directories(base_dir):
36
- directories = ["images", "text", "tables", "page_images"]
37
- for dir in directories:
38
- os.makedirs(os.path.join(base_dir, dir), exist_ok=True)
39
-
40
-
41
- def process_tables(doc, page_num, base_dir, items):
42
- try:
43
- tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
44
- if not tables:
45
- return
46
- for table_idx, table in enumerate(tables):
47
- table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
48
- table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
49
- with open(table_file_name, 'w') as f:
50
- f.write(table_text)
51
- items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
52
- except Exception as e:
53
- print(f"Error extracting tables from page {page_num}: {str(e)}")
54
-
55
- doc = pymupdf.open(filepath)
56
- num_pages = len(doc)
57
- base_dir = "data"
58
-
59
- # Creating the directories
60
- create_directories(base_dir)
61
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
62
- items = []
63
-
64
- # Process each page of the PDF
65
- for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
66
- page = doc[page_num]
67
- process_tables(doc, page_num, base_dir, items)
68
-
69
- [i for i in items if i['type'] == 'table'][0]
70
- # Generating Multimodal Embeddings using Amazon Titan Multimodal Embeddings model
71
- def generate_multimodal_embeddings(prompt=None, image=None, output_embedding_length=384):
72
- """
73
- Invoke the Amazon Titan Multimodal Embeddings model using Amazon Bedrock runtime.
74
-
75
- Args:
76
- prompt (str): The text prompt to provide to the model.
77
- image (str): A base64-encoded image data.
78
- Returns:
79
- str: The model's response embedding.
80
- """
81
- if not prompt and not image:
82
- raise ValueError("Please provide either a text prompt, base64 image, or both as input")
83
-
84
- # Initialize the Amazon Bedrock runtime client
85
- client = boto3.client(service_name="bedrock-runtime")
86
- model_id = "amazon.titan-embed-image-v1"
87
-
88
- body = {"embeddingConfig": {"outputEmbeddingLength": output_embedding_length}}
89
-
90
- if prompt:
91
- body["inputText"] = prompt
92
- if image:
93
- body["inputImage"] = image
94
-
95
- try:
96
- response = client.invoke_model(
97
- modelId=model_id,
98
- body=json.dumps(body),
99
- accept="application/json",
100
- contentType="application/json"
101
- )
102
-
103
- # Process and return the response
104
- result = json.loads(response.get("body").read())
105
- return result.get("embedding")
106
-
107
- except ClientError as err:
108
- print(f"Couldn't invoke Titan embedding model. Error: {err.response['Error']['Message']}")
109
- return None
110
-
111
- # Set embedding vector dimension
112
- embedding_vector_dimension = 384
113
-
114
- # Count the number of each type of item
115
- item_counts = {
116
- 'text': sum(1 for item in items if item['type'] == 'text'),
117
- 'table': sum(1 for item in items if item['type'] == 'table'),
118
- 'image': sum(1 for item in items if item['type'] == 'image'),
119
- 'page': sum(1 for item in items if item['type'] == 'page')
120
- }
121
-
122
- # Initialize counters
123
- counters = dict.fromkeys(item_counts.keys(), 0)
124
-
125
- # Generate embeddings for all items
126
- with tqdm(
127
- total=len(items),
128
- desc="Generating embeddings",
129
- bar_format=(
130
- "{l_bar}{bar}| {n_fmt}/{total_fmt} "
131
- "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
132
- )
133
- ) as pbar:
134
-
135
- for item in items:
136
- item_type = item['type']
137
- counters[item_type] += 1
138
-
139
- if item_type in ['text', 'table']:
140
- # For text or table, use the formatted text representation
141
- item['embedding'] = generate_multimodal_embeddings(prompt=item['text'],output_embedding_length=embedding_vector_dimension)
142
- else:
143
- # For images, use the base64-encoded image data
144
- item['embedding'] = generate_multimodal_embeddings(image=item['image'], output_embedding_length=embedding_vector_dimension)
145
-
146
- # Update the progress bar
147
- pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
148
- pbar.update(1)
149
-
150
- # All the embeddings
151
- all_embeddings = np.array([item['embedding'] for item in items])
152
-
153
- # Create FAISS Index
154
- index = faiss.IndexFlatL2(embedding_vector_dimension)
155
-
156
- # Clear any pre-existing index
157
- index.reset()
158
-
159
- # Add embeddings to the index
160
- index.add(np.array(all_embeddings, dtype=np.float32))
161
-
162
- # Generating RAG response with Amazon Nova
163
- def invoke_nova_multimodal(prompt, matched_items):
164
- """
165
- Invoke the Amazon Nova model.
166
- """
167
-
168
-
169
- # Define your system prompt(s).
170
- system_msg = [
171
- { "text": """You are a helpful assistant for question answering.
172
- The text context is relevant information retrieved.
173
- The provided image(s) are relevant information retrieved."""}
174
- ]
175
-
176
- # Define one or more messages using the "user" and "assistant" roles.
177
- message_content = []
178
-
179
- for item in matched_items:
180
- if item['type'] == 'text' or item['type'] == 'table':
181
- message_content.append({"text": item['text']})
182
- else:
183
- message_content.append({"image": {
184
- "format": "png",
185
- "source": {"bytes": item['image']},
186
- }
187
- })
188
-
189
-
190
- # Configure the inference parameters.
191
- inf_params = {"max_new_tokens": 300,
192
- "top_p": 0.9,
193
- "top_k": 20}
194
-
195
- # Define the final message list
196
- message_list = [
197
- {"role": "user", "content": message_content}
198
- ]
199
-
200
- # Adding the prompt to the message list
201
- message_list.append({"role": "user", "content": [{"text": prompt}]})
202
-
203
- native_request = {
204
- "messages": message_list,
205
- "system": system_msg,
206
- "inferenceConfig": inf_params,
207
- }
208
-
209
- # Initialize the Amazon Bedrock runtime client
210
- model_id = "amazon.nova-pro-v1:0"
211
- client = ChatBedrock(model_id=model_id)
212
-
213
- # Invoke the model and extract the response body.
214
- response = client.invoke(json.dumps(native_request))
215
- model_response = response.content
216
-
217
- return model_response
218
-
219
-
220
- # User Query
221
- query = "Which optimizer was used when training the models?"
222
-
223
- # Generate embeddings for the query
224
- query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
225
-
226
- # Search for the nearest neighbors in the vector database
227
- distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
228
-
229
- # Check the result (matched chunks)
230
- result.flatten()
231
-
232
- # Retrieve the matched items
233
- matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
234
-
235
- # Generate RAG response with Amazon Nova
236
- response = invoke_nova_multimodal(query, matched_items)
237
-
238
- display.Markdown(response)
239
-
240
- # List of queries (Replace with any query of your choice)
241
- other_queries = ["How long were the base and big models trained?",
242
- "Which optimizer was used when training the models?",
243
- "What is the position-wise feed-forward neural network mentioned in the paper?",
244
- "What is the BLEU score of the model in English to German translation (EN-DE)?",
245
- "How is the scaled-dot-product attention is calculated?",
246
- ]
247
-
248
- query = other_queries[0] # Replace with any query from the list above
249
-
250
- # Generate embeddings for the query
251
- query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
252
-
253
- # Search for the nearest neighbors in the vector database
254
- distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
255
-
256
- # Retrieve the matched items
257
- matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
258
-
259
- # Generate RAG response with Amazon Nova
260
- response = invoke_nova_multimodal(query, matched_items)
261
-
262
- # Display the response
263
- display.Markdown(response)
264
-
265
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page3.py DELETED
@@ -1,62 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import tabula
4
- import pymupdf
5
- import os
6
- from tqdm import tqdm
7
-
8
-
9
- def extract_tables_pymupdf(pdf_path):
10
- """Extract tables using PyMuPDF (alternative method)"""
11
- try:
12
- doc = pymupdf.open(pdf_path)
13
- all_tables = []
14
-
15
- for page_num in range(len(doc)):
16
- page = doc[page_num]
17
- tables = page.find_tables()
18
-
19
- for table in tables:
20
- # Extract table data
21
- table_data = table.extract()
22
- if table_data:
23
- # Convert to DataFrame
24
- df = pd.DataFrame(table_data[1:], columns=table_data[0])
25
- all_tables.append({
26
- 'page': page_num + 1,
27
- 'dataframe': df
28
- })
29
-
30
- doc.close()
31
- return all_tables
32
- except Exception as e:
33
- st.error(f"Error extracting tables with PyMuPDF: {e}")
34
- return []
35
-
36
- def main():
37
- st.title("PDF Table Extractor")
38
- st.write("Upload a PDF to extract all tables")
39
-
40
- temp_path = "temp_uploaded.pdf" # Define here
41
-
42
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
43
-
44
- if uploaded_file is not None:
45
- # Save uploaded file temporarily
46
- with open(temp_path, "wb") as f:
47
- f.write(uploaded_file.getbuffer())
48
-
49
- # Using PyMuPDF
50
- tables = extract_tables_pymupdf(temp_path)
51
-
52
- if tables:
53
- st.success(f"Found {len(tables)} tables!")
54
-
55
- for idx, table_info in enumerate(tables):
56
- st.subheader(f"Table {idx + 1} (Page {table_info['page']})")
57
- df = table_info['dataframe']
58
- st.dataframe(df, use_container_width=True)
59
-
60
- # Clean up temp file
61
- if os.path.exists(temp_path):
62
- os.remove(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page4.py DELETED
@@ -1,5 +0,0 @@
1
- import streamlit as st
2
- from pathlib import Path
3
-
4
def main():
    """Render a placeholder heading naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
 
 
 
 
 
 
src/pages/pages/categorized/page5.py DELETED
@@ -1,5 +0,0 @@
1
- import streamlit as st
2
- from pathlib import Path
3
-
4
def main():
    """Render a placeholder heading naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
 
 
 
 
 
 
src/pages/pages/categorized/page6.py DELETED
@@ -1,671 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import tempfile
5
- import zipfile
6
- from io import BytesIO
7
- import fitz # PyMuPDF
8
- import cv2
9
- import numpy as np
10
-
11
- import streamlit as st
12
- import pandas as pd
13
- import requests
14
- import base64
15
- from typing import Dict, Any, Optional
16
- from collections import defaultdict
17
-
18
# Gemini API access configuration.
# SECURITY FIX: the API key was previously hard-coded here, i.e. a live
# credential committed to source control. It is now read from the
# environment — set GEMINI_API_KEY before running the app. The previously
# committed key should be revoked in the Google Cloud console.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
20
-
21
# Structured-output schema passed to Gemini (responseSchema). Despite the
# "mechanical_properties" key name, the prompt asks for properties of ALL
# categories in this single list; "section" carries the category label.
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # "unit" and "test_condition" are intentionally optional
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
-
45
def make_abbreviation(name: str) -> str:
    """Derive an uppercase abbreviation from a material name.

    Takes the first letter of each word that starts with an alphabetic
    character; falls back to the first six characters (uppercased) when no
    such word exists, and to "UNKNOWN" for an empty/None name.
    """
    if not name:
        return "UNKNOWN"
    initials = [token[0] for token in name.split() if token and token[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
-
53
# Rasterization resolution (dots per inch) used when rendering PDF pages.
DPI = 300
# Matches figure captions such as "Fig. 3" / "Figure 12" at the start of a text block.
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
-
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    The PDF is base64-inlined into a single generateContent request that asks
    for material name, abbreviation, and all properties following SCHEMA.
    Returns the decoded JSON dict, or None on any encoding/API/parse failure
    (errors are surfaced via st.error).

    NOTE(review): `filename` is currently unused — kept for interface
    stability / future logging.
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            # temperature 0 + responseSchema => deterministic, structured JSON
            "temperature": 0,
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        # Long timeout: large PDFs can take minutes to process server-side
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # Take the first part whose text looks like a JSON object
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        # Broad catch: network, HTTP, and JSON-decode errors all degrade to None
        st.error(f"Gemini API Error: {e}")
        return None
111
-
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the Gemini extraction JSON into a one-row-per-property DataFrame.

    Falls back to a generated abbreviation when the model returned none, and
    substitutes defaults for missing/empty fields ("Mechanical" section,
    "Unknown property", "N/A" value, empty strings elsewhere).
    """
    material = data.get("material_name", "") or ""
    abbreviation = data.get("material_abbreviation", "") or ""
    if not abbreviation:
        abbreviation = make_abbreviation(material)

    def _as_row(prop):
        # One output record per extracted property, with per-field defaults.
        return {
            "material_name": material,
            "material_abbreviation": abbreviation,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }

    return pd.DataFrame([_as_row(p) for p in data.get("mechanical_properties", [])])
134
-
135
- # --- IMAGE EXTRACTION LOGIC ---
136
def get_page_image(page):
    """Rasterize a PDF page at DPI resolution and return it as a BGR numpy image."""
    # Matrix scales from PDF points (72/inch) to the target DPI.
    pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
    # OpenCV works in BGR order downstream (imencode, etc.)
    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
140
-
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a chart/plot.

    Rejects crops that are tiny (<100 px a side) or too dense with ink
    (>35% foreground, i.e. likely a photo or text block), then requires at
    least one long horizontal or vertical line — a plot axis signature.
    """
    h, w = binary_crop.shape
    if h < 100 or w < 100:
        return False
    ink_density = cv2.countNonZero(binary_crop) / (w * h)
    if ink_density > 0.35:
        return False
    # Erosion with a long thin kernel survives only if a line spans >= 1/4
    # of the crop in that direction.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (w // 4, 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, h // 4))
    has_h = cv2.countNonZero(cv2.erode(binary_crop, h_kernel, iterations=1)) > 0
    has_v = cv2.countNonZero(cv2.erode(binary_crop, v_kernel, iterations=1)) > 0
    return has_h or has_v
153
-
154
def merge_boxes(rects):
    """Deduplicate candidate boxes: drop any rectangle that is fully contained
    (within a 15-pixel tolerance) inside a larger, already-kept rectangle.

    Rectangles are (x, y, w, h); processing order is by area, largest first,
    so outer boxes absorb the inner ones they contain.
    """
    if not rects:
        return []

    by_area_desc = sorted(rects, key=lambda box: box[2] * box[3], reverse=True)
    kept = []
    for rect in by_area_desc:
        rx, ry, rw, rh = rect
        swallowed = any(
            rx >= kx - 15
            and ry >= ky - 15
            and rx + rw <= kx + kw + 15
            and ry + rh <= ky + kh + 15
            for (kx, ky, kw, kh) in kept
        )
        if not swallowed:
            kept.append(rect)
    return kept
164
-
165
def extract_images(pdf_doc):
    """Extract plot images from PDF using improved logic.

    For each page: rasterize, binarize, dilate, and find contours; keep
    contours whose area is 3%-80% of the page and that pass the plot-geometry
    heuristic; merge nested boxes; then pair each box with the nearest
    "Fig./Figure N" caption below it. Crops are PNG-encoded in memory.

    Returns a list of {"caption", "page", "image_data"} dicts, grouped by
    caption (multi-panel figures share one caption entry).
    """
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # pixels added around each detected box before cropping

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Inverse threshold: ink becomes white (foreground) on black
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilation fuses nearby marks so one figure yields one contour
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions occupying 3%-80% of the page area
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    # Text blocks are in PDF points; scale to pixel space
                    cap_y = b[1] * (DPI/72)
                    dist = cap_y - (cy + ch)
                    # Caption must sit below the box, within 30% of page height
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Clamp padded crop to the page bounds
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,   # PNG-encoded, for downloads/zips
                "array": crop         # raw BGR array, for st.image display
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
-
221
def create_zip(results, include_json=True):
    """Bundle extracted plot images (and optionally a metadata JSON) into a
    zip archive, returned as raw bytes.

    `results` is the list produced by extract_images(); each entry's
    image_data items provide "filename" and "bytes". When include_json is
    True, a "plot_data.json" summary (caption/page/image_count per figure)
    is written first.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            summary = [
                {
                    "caption": entry["caption"],
                    "page": entry["page"],
                    "image_count": len(entry["image_data"]),
                }
                for entry in results
            ]
            zf.writestr("plot_data.json", json.dumps(summary, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
-
237
def input_form():
    """Render the manual data-entry form: material class -> category ->
    property cascading selects, then a form for one property value.

    Submitted rows are appended to st.session_state["user_uploaded_data"]
    (a DataFrame), which is created on first submission.
    """
    # Which property categories exist for each material class.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category).
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Each select only appears after the previous one has a value.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

            if submitted:
                # Minimal validation: name and value are mandatory
                if not (material_name and value):
                    st.error("Material name and value are required.")

                else:
                    # Single-row DataFrame for this submission
                    Input_db = pd.DataFrame([{
                        "material_class": material_class,
                        "material_name": material_name,
                        "material_abbreviation": material_abbr,
                        "section": property_category,
                        "property_name": property_name,
                        "value": value,
                        "unit": unit,
                        "english_units": english,
                        "test_condition": test_condition,
                        "comments": comments
                    }])

                    st.success("Property added successfully")
                    st.dataframe(Input_db)

                    # First submission seeds the session store; later ones append.
                    # NOTE(review): the early `return` here is redundant (the
                    # function returns right after anyway) but harmless.
                    if "user_uploaded_data" not in st.session_state:
                        st.session_state["user_uploaded_data"] = Input_db
                        return
                    else:
                        st.session_state["user_uploaded_data"] = pd.concat(
                            [st.session_state["user_uploaded_data"], Input_db],
                            ignore_index=True
                        )

    return
427
-
428
def main():
    """Entry point for the upload page: manual data-entry form plus a PDF
    uploader that extracts material properties (via Gemini) and plot images
    (via OpenCV/PyMuPDF) into two tabs.

    Extraction results are cached in st.session_state so reruns triggered by
    widget interaction do not re-call the API or re-scan the PDF.

    Fixes over the previous revision:
    - `st.image(..., width=img_width)` referenced an undefined name
      `img_width`, raising NameError whenever a plot was displayed; it now
      uses `use_container_width=True` (consistent with the other st.dataframe
      calls in this function).
    - The "Form was submitted" info message was missing a space between its
      concatenated halves ("data/plotsupload").
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # --- Initialize all session-state slots used below ---
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect whether input_form() appended a row during this rerun by
    # comparing the uploaded-data row count before and after.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # No file: show a hint and reset all PDF-related state.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # Identifier used in download filenames (spaces -> underscores).
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    # A different PDF than last rerun invalidates the cached extraction.
    if st.session_state.current_pdf_name != uploaded_file.name:
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False

    if st.session_state.form_submitted:
        # Rerun caused by a form submission: avoid re-processing the PDF.
        st.session_state.form_submitted = False
        st.info("A Form was submitted. But your previous extracted data has been added already. If you want to extract more data/plots "
                "upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    # Write the upload to a temp path: the extractors need a real file.
    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF; cached in session state after.
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data  # keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")
            # After extraction, or when rerunning, use stored data
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # material_type mirrors material_class for Page 1 filtering
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            # Image extraction also runs only once per PDF.
            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Iterate over a snapshot: Delete buttons mutate the live
                    # list, so bounds are re-checked each iteration.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        # BUG FIX: was width=img_width (undefined name -> NameError)
                                        st.image(img_data['array'], use_container_width=True, channels="BGR")
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            # Drop the whole figure entry when its last panel is removed
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")
669
-
670
- if __name__ == "__main__":
671
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/propgraph.jpg DELETED
Binary file (83.4 kB)