gvlktejaswi commited on
Commit
bad95eb
·
verified ·
1 Parent(s): b443b15

Upload 24 files

Browse files
.gitattributes CHANGED
@@ -40,3 +40,4 @@ src/images/images/Epoxy[[:space:]]+[[:space:]]44%[[:space:]]Carbon[[:space:]]fib
40
  src/images/images/Home.png filter=lfs diff=lfs merge=lfs -text
41
  src/images/images/logo.png filter=lfs diff=lfs merge=lfs -text
42
  src/images/images/us_deptenergy.jpg filter=lfs diff=lfs merge=lfs -text
 
 
40
  src/images/images/Home.png filter=lfs diff=lfs merge=lfs -text
41
  src/images/images/logo.png filter=lfs diff=lfs merge=lfs -text
42
  src/images/images/us_deptenergy.jpg filter=lfs diff=lfs merge=lfs -text
43
+ src/pages/categorized/ESS-min.jpg filter=lfs diff=lfs merge=lfs -text
src/pages/3_Categorized_Search.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: categorized search.

Rendering is delegated to ``pages.categorized.page1``; the import happens
inside the loader so the sub-page module is only loaded when this page runs.
"""
import streamlit as st
from PIL import Image  # kept for parity with sibling pages; not used directly here


def load_page1():
    """Render the 'Material Type' categorized-search sub-page."""
    from pages.categorized.page1 import main
    main()


load_page1()

# Spacer rows push the logo toward the bottom of the sidebar.
for _ in range(8):
    st.sidebar.write("")
st.sidebar.image("logo.png", caption=" ", width=150)
src/pages/5_Upload_Data.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: data upload.

Runs the upload flow implemented in ``pages.categorized.page6``.
"""
import streamlit as st
from PIL import Image  # kept for parity with sibling pages; not used directly here


def load_page6():
    """Render the upload-data sub-page."""
    from pages.categorized.page6 import main
    main()


def load_page3():
    """Alternate flow (pages.categorized.page3); defined but not invoked."""
    from pages.categorized.page3 import main
    main()


load_page6()
src/pages/categorized/Backend/Pdf_DataExtraction.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import requests
5
+ import base64
6
+ import json
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+
10
+
11
+
12
+
13
# Backend PDF extraction Logic
# SECURITY FIX: the Gemini API key was previously hard-coded here and committed
# to the repository — a committed key must be treated as compromised and rotated.
# Read it from the environment instead (empty string when unset, so the module
# still imports; API calls will fail with an auth error until the key is provided).
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
16
+
17
# JSON schema enforced on the Gemini response via generationConfig.responseSchema.
# NOTE: despite its name, 'mechanical_properties' is a single flat list that the
# prompt uses for properties of every category (mechanical, thermal, etc.).
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # 'unit' and 'test_condition' may be omitted; the rest are mandatory.
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
40
+
41
+ # === GEMINI CALL FUNCTION ===
42
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    Args:
        pdf_bytes: Raw PDF file contents.
        filename: Original file name; accepted for interface compatibility
            but not transmitted to the API.

    Returns:
        The decoded JSON object on success, or None on any failure
        (errors are surfaced to the UI via ``st.error``).
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    # FIX: the previous prompt asked for fields (experiment_name, measured_value,
    # uncertainty, method, conditions) that do not exist in SCHEMA — contradicting
    # the enforced responseSchema and the fields convert_to_dataframe reads.
    # This prompt matches SCHEMA exactly.
    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
                ]
            }
        ],
        "generationConfig": {
            "temperature": 0,  # deterministic extraction
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # The model may return several parts; take the first that looks like JSON.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        st.error(f"Gemini API Error: {e}")
        return None
103
+
104
+
105
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extraction JSON into a property-per-row DataFrame.

    Each row carries the top-level material name/abbreviation plus one entry
    from ``mechanical_properties``; missing keys become empty strings.
    """
    material = data.get("material_name", "")
    abbrev = data.get("material_abbreviation", "")
    records = [
        {
            "material_name": material,
            "material_abbreviation": abbrev,
            "section": prop.get("section", ""),
            "property_name": prop.get("property_name", ""),
            "value": prop.get("value", ""),
            "unit": prop.get("unit", ""),
            "english": prop.get("english", ""),
            "test_condition": prop.get("test_condition", ""),
            "comments": prop.get("comments", ""),
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
src/pages/categorized/Backend/Pdf_ImageExtraction.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import math
5
+ import tempfile
6
+ import fitz # PyMuPDF
7
+ import cv2
8
+ import numpy as np
9
+ from PIL import Image
10
+ import streamlit as st
11
+
12
# -------------------
# Config
# -------------------
DPI = 300  # rasterization resolution (dots per inch) used by render_page
OUT_DIR = "outputs"  # root folder for saved crops and the JSON manifest

# When True, keep only captions matching the stress/strain keyword pattern below.
KEEP_ONLY_STRESS_STRAIN = False

# Matches caption blocks that start with "Fig. N" / "Figure N".
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
# Keyword filter for stress/strain-related captions (used with KEEP_ONLY_STRESS_STRAIN).
SS_KW = re.compile(
    r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
    re.IGNORECASE
)
25
+
26
+ # -------------------
27
+ # Render helpers
28
+ # -------------------
29
def render_page(page, dpi=DPI):
    """Rasterize a PyMuPDF page to a PIL RGB image.

    Returns (image, matrix); the matrix maps PDF points (72/inch baseline)
    to pixel coordinates at the requested DPI and is reused by
    pdf_to_px_bbox to convert caption bboxes.
    """
    mat = fitz.Matrix(dpi/72, dpi/72)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img, mat
34
+
35
def pdf_to_px_bbox(bbox_pdf, mat):
    """Scale a PDF-space (x0, y0, x1, y1) bbox into pixel space.

    Uses the render matrix's diagonal (mat.a, mat.d) as the x/y scale
    factors and truncates to ints.
    """
    scale_x, scale_y = mat.a, mat.d
    x0, y0, x1, y1 = bbox_pdf
    return (
        int(float(x0) * scale_x),
        int(float(y0) * scale_y),
        int(float(x1) * scale_x),
        int(float(y1) * scale_y),
    )
39
+
40
def safe_crop_px(pil_img, box):
    """Crop *box* (x0, y0, x1, y1) out of *pil_img*, defensively.

    Normalizes swapped coordinates, clamps to the image bounds, and returns
    None for any malformed box (wrong type/arity, nested values, non-numeric
    entries) or a degenerate (zero-area) crop.
    """
    if not isinstance(box, (tuple, list)):
        return None
    # Unwrap a singly-nested box like [(x0, y0, x1, y1)].
    if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
        box = box[0]
    if len(box) != 4:
        return None
    if any(isinstance(coord, (tuple, list)) for coord in box):
        return None

    try:
        left, top, right, bottom = (int(c) for c in box)
    except (TypeError, ValueError):
        return None

    # Normalize reversed coordinates.
    if right < left:
        left, right = right, left
    if bottom < top:
        top, bottom = bottom, top

    # Clamp to the image bounds.
    width, height = pil_img.size
    left = min(max(left, 0), width)
    right = min(max(right, 0), width)
    top = min(max(top, 0), height)
    bottom = min(max(bottom, 0), height)

    if right <= left or bottom <= top:
        return None
    return pil_img.crop((left, top, right, bottom))
73
+
74
+ # -------------------
75
+ # Captions
76
+ # -------------------
77
def find_caption_blocks(page):
    """Collect text blocks on *page* whose text starts like a figure caption.

    Whitespace in each block is collapsed, and blocks matching CAP_RE
    ("Fig. N" / "Figure N") are returned as {"bbox": ..., "text": ...} dicts.
    """
    captions = []
    for block in page.get_text("blocks"):
        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
        normalized = " ".join(str(block[4]).strip().split())
        if CAP_RE.match(normalized):
            captions.append({"bbox": (x0, y0, x1, y1), "text": normalized})
    return captions
86
+
87
+ # -------------------
88
+ # Dedupe: dHash
89
+ # -------------------
90
def dhash64(pil_img):
    """64-bit difference hash of an image, for near-duplicate detection.

    Downscales to a 9x8 grayscale grid and sets one bit per adjacent-pixel
    comparison (left > right), row by row.
    """
    gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
    pixels = list(gray.getdata())
    bits = 0
    for row in range(8):
        base = row * 9
        for col in range(8):
            bits <<= 1
            if pixels[base + col] > pixels[base + col + 1]:
                bits |= 1
    return bits
100
+
101
+ # -------------------
102
+ # Rejectors
103
+ # -------------------
104
def has_colorbar_like_strip(pil_img):
    """Heuristic: True when the image's right edge looks like a colorbar.

    Quantizes the rightmost ~7% strip to a coarse color grid and flags the
    image when the strip contains more than 70 distinct coarse colors —
    typical of heat-map colorbars, not of line plots.
    """
    arr = np.array(pil_img)
    if arr.ndim != 3:
        return False
    height, width = arr.shape[0], arr.shape[1]
    # Too small to hold a meaningful colorbar strip.
    if width < 250 or height < 150:
        return False
    strip_width = max(18, int(0.07 * width))
    right_strip = arr[:, width - strip_width:width, :]
    quantized = (right_strip // 24).reshape(-1, 3)
    distinct = np.unique(quantized, axis=0)
    return len(distinct) > 70
116
+
117
def texture_score(pil_img):
    """Variance of the Laplacian of the grayscale image.

    High values indicate dense high-frequency detail; is_real_plot rejects
    crops scoring above 2200 as photo-like rather than line plots.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    lap = cv2.Laplacian(gray, cv2.CV_64F)
    return float(lap.var())
121
+
122
def is_mostly_legend(pil_img):
    """Heuristic: True for small crops that are likely just a legend box.

    Otsu-binarizes (inverted, so ink is non-zero), denoises, and flags crops
    with moderate ink coverage (3%-18%) whose shorter side is under 260 px.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    bw = cv2.medianBlur(bw, 3)
    H, W = bw.shape
    # Fraction of pixels that are "ink" after thresholding.
    fill = float(np.count_nonzero(bw)) / float(H * W)
    return (0.03 < fill < 0.18) and (min(H, W) < 260)
129
+
130
+ # -------------------
131
+ # Plot detection
132
+ # -------------------
133
def detect_axes_lines(pil_img):
    """Find the longest horizontal and vertical line segments (candidate axes).

    Runs Canny + probabilistic Hough and returns (x_axis, y_axis) as
    (x1, y1, x2, y2) tuples, or (None, None) when either orientation has no
    sufficiently long, sufficiently straight candidate.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    H, W = gray.shape
    # Require segments at least 28% of the shorter image side.
    min_len = int(0.28 * min(H, W))

    lines = cv2.HoughLinesP(
        edges, 1, np.pi/180,
        threshold=90,
        minLineLength=min_len,
        maxLineGap=14
    )
    if lines is None:
        return None, None

    horizontals, verticals = [], []
    for x1, y1, x2, y2 in lines[:, 0]:
        dx, dy = abs(x2-x1), abs(y2-y1)
        length = math.hypot(dx, dy)
        # Near-horizontal: <18 px vertical drift, spanning >35% of the width.
        if dy < 18 and dx > 0.35 * W:
            horizontals.append((length, (x1, y1, x2, y2)))
        # Near-vertical: <18 px horizontal drift, spanning >35% of the height.
        if dx < 18 and dy > 0.35 * H:
            verticals.append((length, (x1, y1, x2, y2)))

    if not horizontals or not verticals:
        return None, None

    # Keep the longest candidate in each orientation.
    horizontals.sort(key=lambda t: t[0], reverse=True)
    verticals.sort(key=lambda t: t[0], reverse=True)
    return horizontals[0][1], verticals[0][1]
163
+
164
def axis_intersection_ok(x_axis, y_axis, W, H):
    """Sanity-check that detected axes meet in a plausible plot position.

    ``x_axis``/``y_axis`` are (x1, y1, x2, y2) segments; W/H is the image
    size. Rejects midpoints outside the image, a y-axis hugging the right
    edge (>95% of W), or an x-axis hugging the top edge (<5% of H).
    """
    x_axis_y = int(round((x_axis[1] + x_axis[3]) / 2))
    y_axis_x = int(round((y_axis[0] + y_axis[2]) / 2))
    if not (0 <= x_axis_y < H and 0 <= y_axis_x < W):
        return False
    return y_axis_x <= int(0.95 * W) and x_axis_y >= int(0.05 * H)
172
+
173
def tick_text_presence_score(pil_img, x_axis, y_axis):
    """Count small ink blobs (tick labels / numbers) near both axes.

    A genuine plot carries many small text components along its axes; the
    returned score is the blob count from a band around the x-axis plus a
    band left of the y-axis. is_real_plot requires a score >= 18.
    """
    img = np.array(pil_img)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    bw = cv2.medianBlur(bw, 3)

    H, W = gray.shape
    # Axis midpoints: y of the x-axis, x of the y-axis.
    xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
    ya_x = int(round((y_axis[0] + y_axis[2]) / 2))

    # Horizontal band around/below the x-axis where tick labels sit.
    y0a = max(0, xa_y - 40)
    y1a = min(H, xa_y + 110)
    x_roi = bw[y0a:y1a, 0:W]

    # Vertical band left of the y-axis where tick labels sit.
    x0b = max(0, ya_x - 180)
    x1b = min(W, ya_x + 50)
    y_roi = bw[0:H, x0b:x1b]

    def count_small_components(mask):
        # Count blobs whose bounding box and area are plausible for printed text.
        num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
        cnt = 0
        for i in range(1, num):
            x, y, w, h, area = stats[i]
            if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
                cnt += 1
        return cnt

    return count_small_components(x_roi) + count_small_components(y_roi)
201
+
202
def is_real_plot(pil_img):
    """Decide whether a crop is a genuine data plot.

    Rejection cascade: colorbar-dominated crops, legend-only crops, crops
    without a detectable axis pair, axes meeting in an implausible place,
    photo-like textures (Laplacian variance > 2200), and crops with fewer
    than 18 tick-label-sized blobs near the axes.
    """
    if has_colorbar_like_strip(pil_img):
        return False
    if is_mostly_legend(pil_img):
        return False

    x_axis, y_axis = detect_axes_lines(pil_img)
    if x_axis is None or y_axis is None:
        return False

    arr = np.array(pil_img)
    H, W = arr.shape[0], arr.shape[1]
    if not axis_intersection_ok(x_axis, y_axis, W, H):
        return False

    # High-frequency texture suggests a photograph/micrograph, not a plot.
    if texture_score(pil_img) > 2200:
        return False

    score = tick_text_presence_score(pil_img, x_axis, y_axis)
    return score >= 18
222
+
223
+ # -------------------
224
+ # Candidate boxes in a region
225
+ # -------------------
226
def connected_components_boxes(pil_img):
    """Bounding boxes of non-background regions, largest first.

    Pixels darker than 245 (i.e. anything that is not near-white background)
    are merged with a 7x7 morphological close, then labeled. Returns
    [(area, (x0, y0, x1, y1)), ...] sorted by area descending.
    """
    img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    mask = (gray < 245).astype(np.uint8) * 255
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
    num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)

    boxes = []
    for i in range(1, num):
        x, y, w, h, area = stats[i]
        boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
    boxes.sort(key=lambda t: t[0], reverse=True)
    return boxes
239
+
240
def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
    """Grow *box* by margins proportional to its own size, clamped to (W, H).

    Each margin is a fraction of the box's own width/height; the bottom
    margin is largest so axis labels below the plot are kept.
    """
    x0, y0, x1, y1 = box
    box_w = x1 - x0
    box_h = y1 - y0
    grown_x0 = max(0, int(x0 - left * box_w))
    grown_x1 = min(W, int(x1 + right * box_w))
    grown_y0 = max(0, int(y0 - top * box_h))
    grown_y1 = min(H, int(y1 + bottom * box_h))
    return (grown_x0, grown_y0, grown_x1, grown_y1)
249
+
250
+ # -------------------
251
+ # Crop plot from caption
252
+ # -------------------
253
def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
    """Locate the plot belonging to a caption and crop it from the page image.

    Converts the caption bbox to pixels, searches the region above it (plus a
    small margin below), expands each large connected component, and keeps
    the largest candidate that passes is_real_plot(). Returns a PIL image or
    None when nothing qualifies.
    """
    cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
    cap_y0 = cap_px[1]
    cap_y1 = cap_px[3]

    W, H = page_img.size
    # Search window: up to 95% of the page height above the caption,
    # plus a 20% margin below it (captions usually sit under their figure).
    search_top = max(0, cap_y0 - int(0.95 * H))
    search_bot = min(H, cap_y1 + int(0.20 * H))
    region = safe_crop_px(page_img, (0, search_top, W, search_bot))
    if region is None:
        return None

    comps = connected_components_boxes(region)
    best = None
    best_area = -1

    # Only the 35 largest components are considered; smaller ones are noise.
    for area, box in comps[:35]:
        x0, y0, x1, y1 = box
        bw = x1 - x0
        bh = y1 - y0
        # Too small to be a readable plot at 300 DPI.
        if bw < 220 or bh < 180:
            continue

        exp = expand_box(box, region.size[0], region.size[1])
        cand = safe_crop_px(region, exp)
        if cand is None:
            continue

        if not is_real_plot(cand):
            continue

        if area > best_area:
            best_area = area
            best = cand

    return best
289
+
290
+ # -------------------
291
+ # Streamlit UI
292
+ # -------------------
293
def run_extraction(pdf_path, paper_id="uploaded_paper"):
    """Extract caption-anchored plot crops from a PDF.

    Saves each accepted crop under OUT_DIR/<paper_id>/plots_with_axes/ and
    writes a plots_with_axes.json manifest next to it.

    Args:
        pdf_path: Path to the PDF on disk.
        paper_id: Subdirectory name for this paper's outputs.

    Returns:
        (results, out_json): list of {page, caption, image} records and the
        path of the JSON manifest.
    """
    out_paper = os.path.join(OUT_DIR, paper_id)
    out_imgs = os.path.join(out_paper, "plots_with_axes")
    os.makedirs(out_imgs, exist_ok=True)

    doc = fitz.open(pdf_path)
    results = []
    seen = set()  # dHash values of crops already saved (perceptual dedupe)
    saved = 0

    try:  # FIX: the document handle was never closed; release it even on error
        for p in range(len(doc)):
            page = doc[p]
            caps = find_caption_blocks(page)
            if not caps:
                continue  # no figure captions -> nothing to anchor a crop to

            page_img, mat = render_page(page, dpi=DPI)

            for cap in caps:
                cap_text = cap["text"]

                if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
                    continue

                fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
                if fig is None:
                    continue

                # Shave a 2 px border to drop expansion artifacts.
                if fig.size[0] > 8 and fig.size[1] > 8:
                    fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))

                try:
                    h = dhash64(fig)
                except Exception:
                    continue

                if h in seen:
                    continue  # near-duplicate of an already-saved crop
                seen.add(h)

                img_name = f"p{p+1:02d}_{saved:04d}.png"
                img_path = os.path.join(out_imgs, img_name)
                fig.save(img_path)

                results.append({
                    "page": p + 1,
                    "caption": cap_text,
                    "image": img_path
                })
                saved += 1
    finally:
        doc.close()

    out_json = os.path.join(out_paper, "plots_with_axes.json")
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results, out_json
349
+
350
def main():
    """Streamlit entry point: upload a PDF, run extraction, show/download results."""
    st.set_page_config(page_title="Research Paper Plot Extractor", layout="wide")
    st.title(" Plot Extractor (Upload PDF)")

    uploaded = st.file_uploader("Upload a research paper PDF", type=["pdf"])
    if not uploaded:
        st.info("Upload a PDF to extract plots.")
        return

    # Derive a filesystem-safe output folder name from the uploaded file name.
    paper_id = os.path.splitext(uploaded.name)[0].replace(" ", "_")

    with tempfile.TemporaryDirectory() as tmpdir:
        # run_extraction needs a real file path, so spool the upload to disk.
        pdf_path = os.path.join(tmpdir, uploaded.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded.read())

        with st.spinner("Extracting plots..."):
            results, out_json = run_extraction(pdf_path, paper_id=paper_id)

        st.success(f"Extracted {len(results)} plots.")

        # Show images + captions
        for r in results:
            st.markdown(f"**Page {r['page']}** — {r['caption']}")
            st.image(r["image"], use_container_width=True)
            st.divider()

        # JSON viewer + download
        st.subheader("JSON Output")
        st.json(results)

        with open(out_json, "rb") as f:
            st.download_button(
                "Download JSON",
                data=f,
                file_name=os.path.basename(out_json),
                mime="application/json"
            )

if __name__ == "__main__":
    main()
src/pages/categorized/ESS-min.jpg ADDED

Git LFS Details

  • SHA256: ff58c9304c39dc90ca15b516a1f1ec385ea60a9829c5dd9eb698ee1f82778eb7
  • Pointer size: 131 Bytes
  • Size of remote file: 356 kB
src/pages/categorized/Temp_Backup.py ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import math
5
+ import tempfile
6
+ import fitz # PyMuPDF
7
+ import cv2
8
+ import numpy as np
9
+ from PIL import Image
10
+ import streamlit as st
11
+ import pandas as pd
12
+ import requests
13
+ import base64
14
+ from typing import Dict, Any, Optional
15
+
16
# SECURITY FIX: the Gemini API key was previously hard-coded here and committed
# to the repository — a committed key must be treated as compromised and rotated.
# Read it from the environment instead (empty string when unset).
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
18
+
19
+ SCHEMA = {
20
+ "type": "OBJECT",
21
+ "properties": {
22
+ "material_name": {"type": "STRING"},
23
+ "material_abbreviation": {"type": "STRING"},
24
+ "mechanical_properties": {
25
+ "type": "ARRAY",
26
+ "items": {
27
+ "type": "OBJECT",
28
+ "properties": {
29
+ "section": {"type": "STRING"},
30
+ "property_name": {"type": "STRING"},
31
+ "value": {"type": "STRING"},
32
+ "unit": {"type": "STRING"},
33
+ "english": {"type": "STRING"},
34
+ "test_condition": {"type": "STRING"},
35
+ "comments": {"type": "STRING"}
36
+ },
37
+ "required": ["section", "property_name", "value", "english", "comments"]
38
+ }
39
+ }
40
+ }
41
+ }
42
def make_abbreviation(name: str) -> str:
    """Derive an abbreviation from a material name.

    Uses the uppercased alphabetic initials of the words in *name*; falls
    back to the first six characters uppercased when no word starts with a
    letter, and to "UNKNOWN" for an empty name.
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    abbr = "".join(initials).upper()
    return abbr if abbr else name[:6].upper()
49
+
50
+ DPI = 300
51
+ OUT_DIR = "outputs"
52
+ KEEP_ONLY_STRESS_STRAIN = False
53
+ CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
54
+ SS_KW = re.compile(
55
+ r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
56
+ re.IGNORECASE
57
+ )
58
+
59
+ def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
60
+ """Calls Gemini API with PDF bytes"""
61
+ try:
62
+ encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
63
+ mime_type = "application/pdf"
64
+ except Exception as e:
65
+ st.error(f"Error encoding PDF: {e}")
66
+ return None
67
+
68
+ prompt = (
69
+ "You are an expert materials scientist. From the attached PDF, extract the material name, "
70
+ "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
71
+ "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
72
+ "For each property, you MUST extract:\n"
73
+ "- property_name\n- value (or range)\n- unit\n"
74
+ "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
75
+ "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
76
+ "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
77
+ )
78
+
79
+ payload = {
80
+ "contents": [{
81
+ "parts": [
82
+ {"text": prompt},
83
+ {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
84
+ ]
85
+ }],
86
+ "generationConfig": {
87
+ "temperature": 0,
88
+ "responseMimeType": "application/json",
89
+ "responseSchema": SCHEMA
90
+ }
91
+ }
92
+
93
+ try:
94
+ r = requests.post(API_URL, json=payload, timeout=300)
95
+ r.raise_for_status()
96
+ data = r.json()
97
+
98
+ candidates = data.get("candidates", [])
99
+ if not candidates:
100
+ return None
101
+
102
+ parts = candidates[0].get("content", {}).get("parts", [])
103
+ json_text = None
104
+ for p in parts:
105
+ t = p.get("text", "")
106
+ if t.strip().startswith("{"):
107
+ json_text = t
108
+ break
109
+
110
+ return json.loads(json_text) if json_text else None
111
+ except Exception as e:
112
+ st.error(f"Gemini API Error: {e}")
113
+ return None
114
+
115
+ # def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
116
+ # """Convert extracted JSON to DataFrame"""
117
+ # rows = []
118
+ # for item in data.get("mechanical_properties", []):
119
+ # rows.append({
120
+ # "material_name": data.get("material_name", ""),
121
+ # "material_abbreviation": data.get("material_abbreviation", ""),
122
+ # "section": item.get("section", ""),
123
+ # "property_name": item.get("property_name", ""),
124
+ # "value": item.get("value", ""),
125
+ # "unit": item.get("unit", ""),
126
+ # "english": item.get("english", ""),
127
+ # "test_condition": item.get("test_condition", ""),
128
+ # "comments": item.get("comments", "")
129
+ # })
130
+ # return pd.DataFrame(rows)
131
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extraction JSON into rows, backfilling missing fields.

    Guarantees a non-empty material_abbreviation (derived from the name via
    make_abbreviation when absent) and substitutes defaults for empty
    section ("Mechanical"), property_name ("Unknown property") and value
    ("N/A") fields; other empty fields become "".
    """
    name = data.get("material_name", "") or ""
    abbrev = data.get("material_abbreviation", "") or ""
    if not abbrev:
        abbrev = make_abbreviation(name)

    records = [
        {
            "material_name": name,
            "material_abbreviation": abbrev,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
153
+
154
+ def render_page(page, dpi=DPI):
155
+ mat = fitz.Matrix(dpi/72, dpi/72)
156
+ pix = page.get_pixmap(matrix=mat, alpha=False)
157
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
158
+ return img, mat
159
+
160
+ def pdf_to_px_bbox(bbox_pdf, mat):
161
+ x0, y0, x1, y1 = bbox_pdf
162
+ sx, sy = mat.a, mat.d
163
+ return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
164
+
165
+ def safe_crop_px(pil_img, box):
166
+ if not isinstance(box, (tuple, list)):
167
+ return None
168
+ if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
169
+ box = box[0]
170
+ if len(box) != 4:
171
+ return None
172
+
173
+ x0, y0, x1, y1 = box
174
+ if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
175
+ return None
176
+
177
+ try:
178
+ x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
179
+ except (TypeError, ValueError):
180
+ return None
181
+
182
+ if x1 < x0: x0, x1 = x1, x0
183
+ if y1 < y0: y0, y1 = y1, y0
184
+
185
+ W, H = pil_img.size
186
+ x0 = max(0, min(W, x0))
187
+ x1 = max(0, min(W, x1))
188
+ y0 = max(0, min(H, y0))
189
+ y1 = max(0, min(H, y1))
190
+ if x1 <= x0 or y1 <= y0:
191
+ return None
192
+ return pil_img.crop((x0, y0, x1, y1))
193
+
194
+ def find_caption_blocks(page):
195
+ caps = []
196
+ blocks = page.get_text("blocks")
197
+ for b in blocks:
198
+ x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
199
+ t = " ".join(str(text).strip().split())
200
+ if CAP_RE.match(t):
201
+ caps.append({"bbox": (x0, y0, x1, y1), "text": t})
202
+ return caps
203
+
204
+ def dhash64(pil_img):
205
+ gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
206
+ pixels = list(gray.getdata())
207
+ bits = 0
208
+ for r in range(8):
209
+ for c in range(8):
210
+ left = pixels[r * 9 + c]
211
+ right = pixels[r * 9 + c + 1]
212
+ bits = (bits << 1) | (1 if left > right else 0)
213
+ return bits
214
+
215
+ def has_colorbar_like_strip(pil_img):
216
+ img = np.array(pil_img)
217
+ if img.ndim != 3:
218
+ return False
219
+ H, W, _ = img.shape
220
+ if W < 250 or H < 150:
221
+ return False
222
+ strip_w = max(18, int(0.07 * W))
223
+ strip = img[:, W-strip_w:W, :]
224
+ q = (strip // 24).reshape(-1, 3)
225
+ uniq = np.unique(q, axis=0)
226
+ return len(uniq) > 70
227
+
228
+ def texture_score(pil_img):
229
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
230
+ lap = cv2.Laplacian(gray, cv2.CV_64F)
231
+ return float(lap.var())
232
+
233
+ def is_mostly_legend(pil_img):
234
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
235
+ bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
236
+ bw = cv2.medianBlur(bw, 3)
237
+ H, W = bw.shape
238
+ fill = float(np.count_nonzero(bw)) / float(H * W)
239
+ return (0.03 < fill < 0.18) and (min(H, W) < 260)
240
+
241
+ def detect_axes_lines(pil_img):
242
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
243
+ edges = cv2.Canny(gray, 50, 150)
244
+ H, W = gray.shape
245
+ min_len = int(0.28 * min(H, W))
246
+
247
+ lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=90, minLineLength=min_len, maxLineGap=14)
248
+ if lines is None:
249
+ return None, None
250
+
251
+ horizontals, verticals = [], []
252
+ for x1, y1, x2, y2 in lines[:, 0]:
253
+ dx, dy = abs(x2-x1), abs(y2-y1)
254
+ length = math.hypot(dx, dy)
255
+ if dy < 18 and dx > 0.35 * W:
256
+ horizontals.append((length, (x1, y1, x2, y2)))
257
+ if dx < 18 and dy > 0.35 * H:
258
+ verticals.append((length, (x1, y1, x2, y2)))
259
+
260
+ if not horizontals or not verticals:
261
+ return None, None
262
+
263
+ horizontals.sort(key=lambda t: t[0], reverse=True)
264
+ verticals.sort(key=lambda t: t[0], reverse=True)
265
+ return horizontals[0][1], verticals[0][1]
266
+
267
+ def axis_intersection_ok(x_axis, y_axis, W, H):
268
+ xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
269
+ ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
270
+ if not (0 <= xa_y < H and 0 <= ya_x < W):
271
+ return False
272
+ if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
273
+ return False
274
+ return True
275
+
276
+ def tick_text_presence_score(pil_img, x_axis, y_axis):
277
+ img = np.array(pil_img)
278
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
279
+ bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
280
+ bw = cv2.medianBlur(bw, 3)
281
+
282
+ H, W = gray.shape
283
+ xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
284
+ ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
285
+
286
+ y0a = max(0, xa_y - 40)
287
+ y1a = min(H, xa_y + 110)
288
+ x_roi = bw[y0a:y1a, 0:W]
289
+
290
+ x0b = max(0, ya_x - 180)
291
+ x1b = min(W, ya_x + 50)
292
+ y_roi = bw[0:H, x0b:x1b]
293
+
294
+ def count_small_components(mask):
295
+ num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
296
+ cnt = 0
297
+ for i in range(1, num):
298
+ x, y, w, h, area = stats[i]
299
+ if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
300
+ cnt += 1
301
+ return cnt
302
+
303
+ return count_small_components(x_roi) + count_small_components(y_roi)
304
+
305
+ def is_real_plot(pil_img):
306
+ if has_colorbar_like_strip(pil_img):
307
+ return False
308
+ if is_mostly_legend(pil_img):
309
+ return False
310
+
311
+ x_axis, y_axis = detect_axes_lines(pil_img)
312
+ if x_axis is None or y_axis is None:
313
+ return False
314
+
315
+ arr = np.array(pil_img)
316
+ H, W = arr.shape[0], arr.shape[1]
317
+ if not axis_intersection_ok(x_axis, y_axis, W, H):
318
+ return False
319
+
320
+ if texture_score(pil_img) > 2200:
321
+ return False
322
+
323
+ score = tick_text_presence_score(pil_img, x_axis, y_axis)
324
+ return score >= 18
325
+
326
+ def connected_components_boxes(pil_img):
327
+ img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
328
+ gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
329
+ mask = (gray < 245).astype(np.uint8) * 255
330
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
331
+ num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
332
+
333
+ boxes = []
334
+ for i in range(1, num):
335
+ x, y, w, h, area = stats[i]
336
+ boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
337
+ boxes.sort(key=lambda t: t[0], reverse=True)
338
+ return boxes
339
+
340
+ def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
341
+ x0, y0, x1, y1 = box
342
+ bw = x1 - x0
343
+ bh = y1 - y0
344
+ ex0 = max(0, int(x0 - left * bw))
345
+ ex1 = min(W, int(x1 + right * bw))
346
+ ey0 = max(0, int(y0 - top * bh))
347
+ ey1 = min(H, int(y1 + bottom * bh))
348
+ return (ex0, ey0, ex1, ey1)
349
+
350
+ def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
351
+ cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
352
+ cap_y0 = cap_px[1]
353
+ cap_y1 = cap_px[3]
354
+
355
+ W, H = page_img.size
356
+ search_top = max(0, cap_y0 - int(0.95 * H))
357
+ search_bot = min(H, cap_y1 + int(0.20 * H))
358
+ region = safe_crop_px(page_img, (0, search_top, W, search_bot))
359
+ if region is None:
360
+ return None
361
+
362
+ comps = connected_components_boxes(region)
363
+ best = None
364
+ best_area = -1
365
+
366
+ for area, box in comps[:35]:
367
+ x0, y0, x1, y1 = box
368
+ bw = x1 - x0
369
+ bh = y1 - y0
370
+ if bw < 220 or bh < 180:
371
+ continue
372
+
373
+ exp = expand_box(box, region.size[0], region.size[1])
374
+ cand = safe_crop_px(region, exp)
375
+ if cand is None:
376
+ continue
377
+
378
+ if not is_real_plot(cand):
379
+ continue
380
+
381
+ if area > best_area:
382
+ best_area = area
383
+ best = cand
384
+
385
+ return best
386
+
387
+ def extract_images(pdf_path, paper_id="uploaded_paper"):
388
+ """Extract plot images from PDF"""
389
+ out_paper = os.path.join(OUT_DIR, paper_id)
390
+ out_imgs = os.path.join(out_paper, "plots_with_axes")
391
+ os.makedirs(out_imgs, exist_ok=True)
392
+
393
+ doc = fitz.open(pdf_path)
394
+ results = []
395
+ seen = set()
396
+ saved = 0
397
+
398
+ for p in range(len(doc)):
399
+ page = doc[p]
400
+ caps = find_caption_blocks(page)
401
+ if not caps:
402
+ continue
403
+
404
+ page_img, mat = render_page(page, dpi=DPI)
405
+
406
+ for cap in caps:
407
+ cap_text = cap["text"]
408
+
409
+ if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
410
+ continue
411
+
412
+ fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
413
+ if fig is None:
414
+ continue
415
+
416
+ if fig.size[0] > 8 and fig.size[1] > 8:
417
+ fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
418
+
419
+ try:
420
+ h = dhash64(fig)
421
+ except Exception:
422
+ continue
423
+
424
+ if h in seen:
425
+ continue
426
+ seen.add(h)
427
+
428
+ img_name = f"p{p+1:02d}_{saved:04d}.png"
429
+ img_path = os.path.join(out_imgs, img_name)
430
+ fig.save(img_path)
431
+
432
+ results.append({
433
+ "page": p + 1,
434
+ "caption": cap_text,
435
+ "image": img_path
436
+ })
437
+ saved += 1
438
+
439
+ return results
440
+
441
+ def input_form():
442
+ PROPERTY_CATEGORIES = {
443
+ "Polymer": [
444
+ "Thermal",
445
+ "Mechanical",
446
+ "Processing",
447
+ "Physical",
448
+ "Descriptive",
449
+ ],
450
+ "Fiber": [
451
+ "Mechanical",
452
+ "Physical",
453
+ "Thermal",
454
+ "Descriptive",
455
+ ],
456
+ "Composite": [
457
+ "Mechanical",
458
+ "Thermal",
459
+ "Processing",
460
+ "Physical",
461
+ "Descriptive",
462
+ "Composition / Reinforcement",
463
+ "Architecture / Structure",
464
+ ],
465
+ }
466
+
467
+ PROPERTY_NAMES = {
468
+ "Polymer": {
469
+ "Thermal": [
470
+ "Glass transition temperature (Tg)",
471
+ "Melting temperature (Tm)",
472
+ "Crystallization temperature (Tc)",
473
+ "Degree of crystallinity",
474
+ "Decomposition temperature",
475
+ ],
476
+ "Mechanical": [
477
+ "Tensile modulus",
478
+ "Tensile strength",
479
+ "Elongation at break",
480
+ "Flexural modulus",
481
+ "Impact strength",
482
+ ],
483
+ "Processing": [
484
+ "Melt flow index (MFI)",
485
+ "Processing temperature",
486
+ "Cooling rate",
487
+ "Mold shrinkage",
488
+ ],
489
+ "Physical": [
490
+ "Density",
491
+ "Specific gravity",
492
+ ],
493
+ "Descriptive": [
494
+ "Material grade",
495
+ "Manufacturer",
496
+ ],
497
+ },
498
+
499
+ "Fiber": {
500
+ "Mechanical": [
501
+ "Tensile modulus",
502
+ "Tensile strength",
503
+ "Strain to failure",
504
+ ],
505
+ "Physical": [
506
+ "Density",
507
+ "Fiber diameter",
508
+ ],
509
+ "Thermal": [
510
+ "Decomposition temperature",
511
+ ],
512
+ "Descriptive": [
513
+ "Fiber type",
514
+ "Surface treatment",
515
+ ],
516
+ },
517
+
518
+ "Composite": {
519
+ "Mechanical": [
520
+ "Longitudinal modulus (E1)",
521
+ "Transverse modulus (E2)",
522
+ "Shear modulus (G12)",
523
+ "Poissons ratio (V12)",
524
+ "Tensile strength (fiber direction)",
525
+ "Interlaminar shear strength",
526
+ ],
527
+ "Thermal": [
528
+ "Glass transition temperature (matrix)",
529
+ "Coefficient of thermal expansion (CTE)",
530
+ ],
531
+ "Processing": [
532
+ "Curing temperature",
533
+ "Curing pressure",
534
+ ],
535
+ "Physical": [
536
+ "Density",
537
+ ],
538
+ "Descriptive": [
539
+ "Laminate type",
540
+ ],
541
+ "Composition / Reinforcement": [
542
+ "Fiber volume fraction",
543
+ "Fiber weight fraction",
544
+ "Fiber type",
545
+ "Matrix type",
546
+ ],
547
+ "Architecture / Structure": [
548
+ "Weave type",
549
+ "Ply orientation",
550
+ "Number of plies",
551
+ "Stacking sequence",
552
+ ],
553
+ },
554
+ }
555
+
556
+
557
+
558
+ st.title("Materials Property Input Form")
559
+
560
+ material_class = st.selectbox(
561
+ "Select Material Class",
562
+ ("Polymer", "Fiber", "Composite"),
563
+ index=None,
564
+ placeholder="Choose material class",
565
+ )
566
+
567
+ if material_class:
568
+ property_category = st.selectbox(
569
+ "Select Property Category",
570
+ PROPERTY_CATEGORIES[material_class],
571
+ index=None,
572
+ placeholder="Choose property category",
573
+ )
574
+ else:
575
+ property_category = None
576
+
577
+ if material_class and property_category:
578
+ property_name = st.selectbox(
579
+ "Select Property",
580
+ PROPERTY_NAMES[material_class][property_category],
581
+ index=None,
582
+ placeholder="Choose property",
583
+ )
584
+ else:
585
+ property_name = None
586
+
587
+ if material_class and property_category and property_name:
588
+ with st.form("user_input"):
589
+ st.subheader("Enter Data")
590
+
591
+ material_name = st.text_input("Material Name")
592
+ material_abbr = st.text_input("Material Abbreviation")
593
+
594
+ value = st.text_input("Value")
595
+ unit = st.text_input("Unit (SI)")
596
+ english = st.text_input("English Units")
597
+ test_condition = st.text_input("Test Condition")
598
+ comments = st.text_area("Comments")
599
+
600
+ submitted = st.form_submit_button("Submit")
601
+
602
+ if submitted:
603
+ if not (material_name and value):
604
+ st.error("Material name and value are required.")
605
+ else:
606
+ Input_db = pd.DataFrame([{
607
+ "material_class": material_class,
608
+ "material_name": material_name,
609
+ "material_abbreviation": material_abbr,
610
+ "section": property_category,
611
+ "property_name": property_name,
612
+ "value": value,
613
+ "unit": unit,
614
+ "english_units": english,
615
+ "test_condition": test_condition,
616
+ "comments": comments
617
+ }])
618
+
619
+ st.success("Property added successfully")
620
+ st.dataframe(Input_db)
621
+
622
+
623
+ if "user_uploaded_data" not in st.session_state:
624
+ st.session_state["user_uploaded_data"] = Input_db
625
+ else:
626
+ st.session_state["user_uploaded_data"] = pd.concat(
627
+ [st.session_state["user_uploaded_data"], Input_db],
628
+ ignore_index=True
629
+ )
630
+ def main():
631
+ input_form()
632
+ st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")
633
+ st.title("PDF Material Data & Plot Extractor")
634
+
635
+ uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
636
+
637
+ if not uploaded_file:
638
+ st.info("Upload a PDF to extract material data and plots")
639
+ return
640
+
641
+ paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")
642
+
643
+ tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
644
+
645
+ with tempfile.TemporaryDirectory() as tmpdir:
646
+ pdf_path = os.path.join(tmpdir, uploaded_file.name)
647
+ with open(pdf_path, "wb") as f:
648
+ f.write(uploaded_file.getbuffer())
649
+
650
+ with tab1:
651
+ st.subheader("Material Properties Data")
652
+
653
+ with st.spinner(" Extracting material data..."):
654
+ with open(pdf_path, "rb") as f:
655
+ pdf_bytes = f.read()
656
+
657
+ data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
658
+
659
+ if data:
660
+ df = convert_to_dataframe(data)
661
+
662
+ if not df.empty:
663
+ st.success(f"Extracted {len(df)} properties")
664
+
665
+ col1, col2 = st.columns(2)
666
+ with col1:
667
+ st.metric("Material", data.get("material_name", "N/A"))
668
+ with col2:
669
+ st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))
670
+
671
+ st.dataframe(df, use_container_width=True, height=400)
672
+ st.subheader("Assign Material Category")
673
+
674
+ extracted_material_class = st.selectbox(
675
+ "Select category for this material",
676
+ ["Polymer", "Fiber", "Composite"],
677
+ index=None,
678
+ placeholder="Required before adding to database"
679
+ )
680
+ if st.button(" Add to Database"):
681
+ if not extracted_material_class:
682
+ st.error("Please select a material category before adding.")
683
+ else:
684
+ df["material_class"] = extracted_material_class
685
+
686
+ if "user_uploaded_data" not in st.session_state:
687
+ st.session_state["user_uploaded_data"] = df
688
+ else:
689
+ st.session_state["user_uploaded_data"] = pd.concat(
690
+ [st.session_state["user_uploaded_data"], df],
691
+ ignore_index=True
692
+ )
693
+
694
+ st.success(f"Added to {extracted_material_class} database!")
695
+
696
+ # if st.button(" Add to Database"):
697
+ # if "user_uploaded_data" not in st.session_state:
698
+ # st.session_state["user_uploaded_data"] = df
699
+ # else:
700
+ # st.session_state["user_uploaded_data"] = pd.concat(
701
+ # [st.session_state["user_uploaded_data"], df],
702
+ # ignore_index=True
703
+ # )
704
+ # st.success("Added to database!")
705
+
706
+ csv = df.to_csv(index=False)
707
+ st.download_button(
708
+ "Download CSV",
709
+ data=csv,
710
+ file_name=f"{paper_id}_data.csv",
711
+ mime="text/csv"
712
+ )
713
+ else:
714
+ st.warning("No data extracted")
715
+ else:
716
+ st.error("Failed to extract data from PDF")
717
+
718
+ with tab2:
719
+ st.subheader("Extracted Plot Images")
720
+
721
+ with st.spinner(" Extracting plots from PDF..."):
722
+ image_results = extract_images(pdf_path, paper_id=paper_id)
723
+
724
+ if image_results:
725
+ st.success(f" Extracted {len(image_results)} plots")
726
+
727
+ for r in image_results:
728
+ st.markdown(f"**Page {r['page']}** — {r['caption']}")
729
+ st.image(r["image"], use_container_width=True)
730
+ st.divider()
731
+ else:
732
+ st.warning("No plots found in PDF")
733
+
734
+
735
+ if __name__ == "__main__":
736
+ main()
src/pages/categorized/__pycache__/page1.cpython-312.pyc ADDED
Binary file (4.86 kB). View file
 
src/pages/categorized/__pycache__/page1.cpython-313.pyc ADDED
Binary file (4.94 kB). View file
 
src/pages/categorized/__pycache__/page1.cpython-314.pyc ADDED
Binary file (9.83 kB). View file
 
src/pages/categorized/__pycache__/page2.cpython-312.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page2.cpython-313.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page2.cpython-314.pyc ADDED
Binary file (672 Bytes). View file
 
src/pages/categorized/__pycache__/page3.cpython-313.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page3.cpython-314.pyc ADDED
Binary file (2.93 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc ADDED
Binary file (34 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc.2029864538672 ADDED
Binary file (8.01 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc.2097035857760 ADDED
Binary file (1.22 kB). View file
 
src/pages/categorized/page1.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import re
5
+
6
+ def extract_matrix_fiber_from_abbr(abbr: str):
7
+ if not isinstance(abbr, str):
8
+ return None, None
9
+
10
+ text = abbr.lower()
11
+
12
+ matrix_map = {
13
+ "epoxy": "Epoxy",
14
+ "cyanate ester": "Cyanate Ester",
15
+ "cynate ester": "Cyanate Ester",
16
+ "polypropylene": "Polypropylene",
17
+ "pp": "Polypropylene",
18
+ "peek": "PEEK",
19
+ "pei": "PEI",
20
+ "nylon": "Nylon",
21
+ "pa6": "PA6",
22
+ "polyester": "Polyester",
23
+ "vinyl ester": "Vinyl Ester",
24
+ "phenolic": "Phenolic"
25
+ }
26
+
27
+ matrix = None
28
+ for key, val in matrix_map.items():
29
+ if key in text:
30
+ matrix = val
31
+ break
32
+
33
+ fiber_map = {
34
+ "carbon": "Carbon Fiber",
35
+ "glass": "Glass Fiber",
36
+ "e-glass": "E-Glass Fiber",
37
+ "s-glass": "S-Glass Fiber",
38
+ "aramid": "Aramid Fiber",
39
+ "kevlar": "Kevlar Fiber",
40
+ "basalt": "Basalt Fiber",
41
+ "natural": "Natural Fiber"
42
+ }
43
+
44
+ fiber = None
45
+ for key, val in fiber_map.items():
46
+ if key in text:
47
+ fiber = val
48
+ break
49
+
50
+ return matrix, fiber
51
+
52
+
53
+ def main():
54
+ st.set_page_config(layout="wide")
55
+
56
+ mat_section = st.sidebar.expander("Materials", expanded=False)
57
+ with mat_section:
58
+ thermo = mat_section.button("Composites")
59
+ polymers = mat_section.button("Polymers")
60
+ Fibers = mat_section.button("Fibers")
61
+
62
+ if "material_type" not in st.session_state:
63
+ st.session_state.material_type = "Composites"
64
+
65
+ if thermo:
66
+ st.session_state.material_type = "Composites"
67
+ elif polymers:
68
+ st.session_state.material_type = "Polymers"
69
+ elif Fibers:
70
+ st.session_state.material_type = "Fibers"
71
+
72
+ @st.cache_data
73
+ def load_data(material_type):
74
+ file_map = {
75
+ "Composites": "data/Composites_material_data.csv",
76
+ "Polymers": "data/polymers_material_data.csv",
77
+ "Fibers": "data/Fibers_material_data.csv",
78
+ }
79
+ return pd.read_csv(file_map[material_type])
80
+
81
+ csv_data = load_data(st.session_state.material_type)
82
+
83
+ # if "user_uploaded_data" in st.session_state:
84
+ # df = pd.concat([csv_data, st.session_state["user_uploaded_data"]], ignore_index=True)
85
+ # else:
86
+ # df = csv_data
87
+ # Normalize naming between pages
88
+ CLASS_MAP = {
89
+ "Polymers": "Polymer",
90
+ "Fibers": "Fiber",
91
+ "Composites": "Composite",
92
+ }
93
+
94
+ current_class = CLASS_MAP[st.session_state.material_type]
95
+
96
+ if "user_uploaded_data" in st.session_state:
97
+ user_df = st.session_state["user_uploaded_data"]
98
+
99
+ filtered_user_df = user_df[
100
+ user_df["material_class"] == current_class
101
+ ]
102
+
103
+ df = pd.concat([csv_data, filtered_user_df], ignore_index=True)
104
+ else:
105
+ df = csv_data
106
+
107
+
108
+ st.session_state["base_data"] = df
109
+
110
+ st.title("Materials DataSet")
111
+
112
+ materials_df = (
113
+ df[["material_abbreviation", "material_name"]]
114
+ .fillna("")
115
+ .drop_duplicates()
116
+ .reset_index(drop=True)
117
+ )
118
+
119
+ materials_df[["Matrix", "Fiber"]] = materials_df["material_abbreviation"].apply(
120
+ lambda x: pd.Series(extract_matrix_fiber_from_abbr(x))
121
+ )
122
+
123
+
124
+ col1, col2 = st.columns(2, vertical_alignment="center")
125
+
126
+ # st.subheader("Filter Composites")
127
+
128
+ # matrix_options = sorted(
129
+ # materials_df["Matrix"].dropna().unique()
130
+ # )
131
+
132
+ # fiber_options = sorted(
133
+ # materials_df["Fiber"].dropna().unique()
134
+ # )
135
+
136
+ # fcol1, fcol2 = st.columns(2)
137
+
138
+ # with fcol1:
139
+ # selected_matrix = st.selectbox(
140
+ # "Matrix Material",
141
+ # ["All"] + matrix_options
142
+ # )
143
+
144
+ # with fcol2:
145
+ # selected_fiber = st.selectbox(
146
+ # "Fiber Material",
147
+ # ["All"] + fiber_options
148
+ # )
149
+
150
+
151
+ # filtered_materials_df = materials_df.copy()
152
+
153
+ # if selected_matrix != "All":
154
+ # filtered_materials_df = filtered_materials_df[
155
+ # filtered_materials_df["Matrix"] == selected_matrix
156
+ # ]
157
+
158
+ # if selected_fiber != "All":
159
+ # filtered_materials_df = filtered_materials_df[
160
+ # filtered_materials_df["Fiber"] == selected_fiber
161
+ # ]
162
+
163
+
164
+ with col1:
165
+ st.write("Filter Composites")
166
+
167
+ selected_matrix = "All"
168
+ selected_fiber = "All"
169
+
170
+ if st.session_state.material_type == "Composites":
171
+
172
+
173
+ matrix_options = sorted(
174
+ materials_df["Matrix"].dropna().unique()
175
+ )
176
+
177
+ fiber_options = sorted(
178
+ materials_df["Fiber"].dropna().unique()
179
+ )
180
+
181
+ fcol1, fcol2 = st.columns(2)
182
+
183
+ with fcol1:
184
+ selected_matrix = st.selectbox(
185
+ "Matrix Material",
186
+ ["All"] + matrix_options
187
+ )
188
+
189
+ with fcol2:
190
+ selected_fiber = st.selectbox(
191
+ "Fiber Material",
192
+ ["All"] + fiber_options
193
+ )
194
+
195
+
196
+
197
+ filtered_materials_df = materials_df.copy()
198
+
199
+ if st.session_state.material_type == "Composites":
200
+ if selected_matrix != "All":
201
+ filtered_materials_df = filtered_materials_df[
202
+ filtered_materials_df["Matrix"] == selected_matrix
203
+ ]
204
+
205
+ if selected_fiber != "All":
206
+ filtered_materials_df = filtered_materials_df[
207
+ filtered_materials_df["Fiber"] == selected_fiber
208
+ ]
209
+
210
+ st.write("Select Material")
211
+ st.dataframe(
212
+ filtered_materials_df,
213
+ key="material_table",
214
+ selection_mode="single-cell",
215
+ on_select="rerun",
216
+ use_container_width=True,
217
+ height=260
218
+ )
219
+
220
+ def get_selected_value(df, key, column_name):
221
+ if key in st.session_state:
222
+ sel = st.session_state[key]["selection"]["cells"]
223
+ if sel:
224
+ row_idx = sel[0][0]
225
+ return df.iloc[row_idx][column_name]
226
+ return None
227
+
228
+
229
+ mat = get_selected_value(materials_df, "material_table", "material_abbreviation")
230
+
231
+ with col2:
232
+ st.write("Select Property")
233
+
234
+ if mat:
235
+ filtered_df = df[
236
+ (df["material_abbreviation"] == mat) &
237
+ (df["value"].notna()) &
238
+ (df["property_name"].notna())
239
+ ]
240
+ property_sel = st.selectbox(
241
+ "Type of Property",
242
+ filtered_df["section"].drop_duplicates()
243
+ )
244
+
245
+ properties_df = (
246
+ filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
247
+ .drop_duplicates()
248
+ .reset_index(drop=True)
249
+ )
250
+ else:
251
+ filtered_df = df[df["value"].notna() & df["property_name"].notna()]
252
+ property_sel = st.selectbox(
253
+ "Type of Property",
254
+ filtered_df["section"].drop_duplicates()
255
+ )
256
+
257
+ properties_df = (
258
+ filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
259
+ .drop_duplicates()
260
+ .reset_index(drop=True)
261
+ )
262
+
263
+ st.dataframe(
264
+ properties_df,
265
+ key="property_table",
266
+ selection_mode="single-cell",
267
+ on_select="rerun",
268
+ use_container_width=True,
269
+ height=260
270
+ )
271
+
272
+ prop = get_selected_value(properties_df, "property_table", "property_name")
273
+
274
+ st.write("")
275
+ if st.button("Search", disabled=not (mat and prop)):
276
+ st.write(f"**Material:** {mat}")
277
+ st.write(f"**Property:** {prop}")
278
+
279
+ result = df[
280
+ (df["material_abbreviation"] == mat) &
281
+ (df["property_name"] == prop) &
282
+ (df["value"].notna())
283
+ ]
284
+
285
+ if not result.empty:
286
+ st.subheader("Property Data")
287
+ st.dataframe(result.T, use_container_width=True)
288
+
289
+ st.subheader("Property Graph")
290
+ img_path = f"images/{mat}_{prop}.png"
291
+
292
+ try:
293
+ img = Image.open(img_path)
294
+ st.image(img, use_container_width=True, caption="Stress strain curve")
295
+ except FileNotFoundError:
296
+ st.write("")
297
+ # fallback_img = Image.open("pages/categorized/ESS-min.jpg")
298
+ # st.image(fallback_img, use_container_width=True, caption="Stress strain curve")
299
+
300
+ else:
301
+ st.warning("No data found for this material-property combination")
302
+
303
+
304
+
305
+
306
+
307
+
src/pages/categorized/page2.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ from PIL import Image
5
+ import boto3
6
+ import tabula
7
+ import faiss
8
+ import json
9
+ import base64
10
+ import pymupdf
11
+ import requests
12
+ import os
13
+ import logging
14
+ import numpy as np
15
+ import warnings
16
+ from tqdm import tqdm
17
+ from botocore.exceptions import ClientError
18
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
19
+ from IPython import display
20
+ from langchain_aws import ChatBedrock
21
+
22
+
23
+ from pathlib import Path
24
+
25
+ def main():
26
+
27
+
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logger.setLevel(logging.ERROR)
32
+
33
+ warnings.filterwarnings("ignore")
34
+
35
+ def create_directories(base_dir):
36
+ directories = ["images", "text", "tables", "page_images"]
37
+ for dir in directories:
38
+ os.makedirs(os.path.join(base_dir, dir), exist_ok=True)
39
+
40
+
41
+ def process_tables(doc, page_num, base_dir, items):
42
+ try:
43
+ tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
44
+ if not tables:
45
+ return
46
+ for table_idx, table in enumerate(tables):
47
+ table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
48
+ table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
49
+ with open(table_file_name, 'w') as f:
50
+ f.write(table_text)
51
+ items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
52
+ except Exception as e:
53
+ print(f"Error extracting tables from page {page_num}: {str(e)}")
54
+
55
+ doc = pymupdf.open(filepath)
56
+ num_pages = len(doc)
57
+ base_dir = "data"
58
+
59
+ # Creating the directories
60
+ create_directories(base_dir)
61
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
62
+ items = []
63
+
64
+ # Process each page of the PDF
65
+ for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
66
+ page = doc[page_num]
67
+ process_tables(doc, page_num, base_dir, items)
68
+
69
+ [i for i in items if i['type'] == 'table'][0]
70
+ # Generating Multimodal Embeddings using Amazon Titan Multimodal Embeddings model
71
+ def generate_multimodal_embeddings(prompt=None, image=None, output_embedding_length=384):
72
+ """
73
+ Invoke the Amazon Titan Multimodal Embeddings model using Amazon Bedrock runtime.
74
+
75
+ Args:
76
+ prompt (str): The text prompt to provide to the model.
77
+ image (str): A base64-encoded image data.
78
+ Returns:
79
+ str: The model's response embedding.
80
+ """
81
+ if not prompt and not image:
82
+ raise ValueError("Please provide either a text prompt, base64 image, or both as input")
83
+
84
+ # Initialize the Amazon Bedrock runtime client
85
+ client = boto3.client(service_name="bedrock-runtime")
86
+ model_id = "amazon.titan-embed-image-v1"
87
+
88
+ body = {"embeddingConfig": {"outputEmbeddingLength": output_embedding_length}}
89
+
90
+ if prompt:
91
+ body["inputText"] = prompt
92
+ if image:
93
+ body["inputImage"] = image
94
+
95
+ try:
96
+ response = client.invoke_model(
97
+ modelId=model_id,
98
+ body=json.dumps(body),
99
+ accept="application/json",
100
+ contentType="application/json"
101
+ )
102
+
103
+ # Process and return the response
104
+ result = json.loads(response.get("body").read())
105
+ return result.get("embedding")
106
+
107
+ except ClientError as err:
108
+ print(f"Couldn't invoke Titan embedding model. Error: {err.response['Error']['Message']}")
109
+ return None
110
+
111
+ # Set embedding vector dimension
112
+ embedding_vector_dimension = 384
113
+
114
+ # Count the number of each type of item
115
+ item_counts = {
116
+ 'text': sum(1 for item in items if item['type'] == 'text'),
117
+ 'table': sum(1 for item in items if item['type'] == 'table'),
118
+ 'image': sum(1 for item in items if item['type'] == 'image'),
119
+ 'page': sum(1 for item in items if item['type'] == 'page')
120
+ }
121
+
122
+ # Initialize counters
123
+ counters = dict.fromkeys(item_counts.keys(), 0)
124
+
125
+ # Generate embeddings for all items
126
+ with tqdm(
127
+ total=len(items),
128
+ desc="Generating embeddings",
129
+ bar_format=(
130
+ "{l_bar}{bar}| {n_fmt}/{total_fmt} "
131
+ "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
132
+ )
133
+ ) as pbar:
134
+
135
+ for item in items:
136
+ item_type = item['type']
137
+ counters[item_type] += 1
138
+
139
+ if item_type in ['text', 'table']:
140
+ # For text or table, use the formatted text representation
141
+ item['embedding'] = generate_multimodal_embeddings(prompt=item['text'],output_embedding_length=embedding_vector_dimension)
142
+ else:
143
+ # For images, use the base64-encoded image data
144
+ item['embedding'] = generate_multimodal_embeddings(image=item['image'], output_embedding_length=embedding_vector_dimension)
145
+
146
+ # Update the progress bar
147
+ pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
148
+ pbar.update(1)
149
+
150
+ # All the embeddings
151
+ all_embeddings = np.array([item['embedding'] for item in items])
152
+
153
+ # Create FAISS Index
154
+ index = faiss.IndexFlatL2(embedding_vector_dimension)
155
+
156
+ # Clear any pre-existing index
157
+ index.reset()
158
+
159
+ # Add embeddings to the index
160
+ index.add(np.array(all_embeddings, dtype=np.float32))
161
+
162
+ # Generating RAG response with Amazon Nova
163
+ def invoke_nova_multimodal(prompt, matched_items):
164
+ """
165
+ Invoke the Amazon Nova model.
166
+ """
167
+
168
+
169
+ # Define your system prompt(s).
170
+ system_msg = [
171
+ { "text": """You are a helpful assistant for question answering.
172
+ The text context is relevant information retrieved.
173
+ The provided image(s) are relevant information retrieved."""}
174
+ ]
175
+
176
+ # Define one or more messages using the "user" and "assistant" roles.
177
+ message_content = []
178
+
179
+ for item in matched_items:
180
+ if item['type'] == 'text' or item['type'] == 'table':
181
+ message_content.append({"text": item['text']})
182
+ else:
183
+ message_content.append({"image": {
184
+ "format": "png",
185
+ "source": {"bytes": item['image']},
186
+ }
187
+ })
188
+
189
+
190
+ # Configure the inference parameters.
191
+ inf_params = {"max_new_tokens": 300,
192
+ "top_p": 0.9,
193
+ "top_k": 20}
194
+
195
+ # Define the final message list
196
+ message_list = [
197
+ {"role": "user", "content": message_content}
198
+ ]
199
+
200
+ # Adding the prompt to the message list
201
+ message_list.append({"role": "user", "content": [{"text": prompt}]})
202
+
203
+ native_request = {
204
+ "messages": message_list,
205
+ "system": system_msg,
206
+ "inferenceConfig": inf_params,
207
+ }
208
+
209
+ # Initialize the Amazon Bedrock runtime client
210
+ model_id = "amazon.nova-pro-v1:0"
211
+ client = ChatBedrock(model_id=model_id)
212
+
213
+ # Invoke the model and extract the response body.
214
+ response = client.invoke(json.dumps(native_request))
215
+ model_response = response.content
216
+
217
+ return model_response
218
+
219
+
220
+ # User Query
221
+ query = "Which optimizer was used when training the models?"
222
+
223
+ # Generate embeddings for the query
224
+ query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
225
+
226
+ # Search for the nearest neighbors in the vector database
227
+ distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
228
+
229
+ # Check the result (matched chunks)
230
+ result.flatten()
231
+
232
+ # Retrieve the matched items
233
+ matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
234
+
235
+ # Generate RAG response with Amazon Nova
236
+ response = invoke_nova_multimodal(query, matched_items)
237
+
238
+ display.Markdown(response)
239
+
240
+ # List of queries (Replace with any query of your choice)
241
+ other_queries = ["How long were the base and big models trained?",
242
+ "Which optimizer was used when training the models?",
243
+ "What is the position-wise feed-forward neural network mentioned in the paper?",
244
+ "What is the BLEU score of the model in English to German translation (EN-DE)?",
245
+ "How is the scaled-dot-product attention is calculated?",
246
+ ]
247
+
248
+ query = other_queries[0] # Replace with any query from the list above
249
+
250
+ # Generate embeddings for the query
251
+ query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
252
+
253
+ # Search for the nearest neighbors in the vector database
254
+ distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
255
+
256
+ # Retrieve the matched items
257
+ matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
258
+
259
+ # Generate RAG response with Amazon Nova
260
+ response = invoke_nova_multimodal(query, matched_items)
261
+
262
+ # Display the response
263
+ display.Markdown(response)
264
+
265
+
src/pages/categorized/page3.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import tabula
4
+ import pymupdf
5
+ import os
6
+ from tqdm import tqdm
7
+
8
+
9
+ def extract_tables_pymupdf(pdf_path):
10
+ """Extract tables using PyMuPDF (alternative method)"""
11
+ try:
12
+ doc = pymupdf.open(pdf_path)
13
+ all_tables = []
14
+
15
+ for page_num in range(len(doc)):
16
+ page = doc[page_num]
17
+ tables = page.find_tables()
18
+
19
+ for table in tables:
20
+ # Extract table data
21
+ table_data = table.extract()
22
+ if table_data:
23
+ # Convert to DataFrame
24
+ df = pd.DataFrame(table_data[1:], columns=table_data[0])
25
+ all_tables.append({
26
+ 'page': page_num + 1,
27
+ 'dataframe': df
28
+ })
29
+
30
+ doc.close()
31
+ return all_tables
32
+ except Exception as e:
33
+ st.error(f"Error extracting tables with PyMuPDF: {e}")
34
+ return []
35
+
36
+ def main():
37
+ st.title("PDF Table Extractor")
38
+ st.write("Upload a PDF to extract all tables")
39
+
40
+ temp_path = "temp_uploaded.pdf" # Define here
41
+
42
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
43
+
44
+ if uploaded_file is not None:
45
+ # Save uploaded file temporarily
46
+ with open(temp_path, "wb") as f:
47
+ f.write(uploaded_file.getbuffer())
48
+
49
+ # Using PyMuPDF
50
+ tables = extract_tables_pymupdf(temp_path)
51
+
52
+ if tables:
53
+ st.success(f"Found {len(tables)} tables!")
54
+
55
+ for idx, table_info in enumerate(tables):
56
+ st.subheader(f"Table {idx + 1} (Page {table_info['page']})")
57
+ df = table_info['dataframe']
58
+ st.dataframe(df, use_container_width=True)
59
+
60
+ # Clean up temp file
61
+ if os.path.exists(temp_path):
62
+ os.remove(temp_path)
src/pages/categorized/page4.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
def main():
    """Render a placeholder header naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
src/pages/categorized/page5.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
def main():
    """Render a placeholder header naming this page's folder and file."""
    current = Path(__file__)
    st.write(f'# {current.parent.name} - {current.name}')
src/pages/categorized/page6.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import tempfile
5
+ import zipfile
6
+ from io import BytesIO
7
+ import fitz # PyMuPDF
8
+ import cv2
9
+ import numpy as np
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import requests
14
+ import base64
15
+ from typing import Dict, Any, Optional
16
+ from collections import defaultdict
17
+
18
# Gemini API configuration.
# SECURITY FIX: the API key was previously hard-coded here, which leaks the
# credential in version control (the old key should be revoked). It is now
# read from the environment; set GEMINI_API_KEY before running the app.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"

# Structured-output schema forcing Gemini to return the material's name and
# abbreviation plus one flat list of properties ("mechanical_properties" is
# used as the single bucket for ALL property categories).
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
+
45
def make_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from a material name.

    Uses the initial letter of each word; falls back to the first six
    characters (uppercased) when no word starts with a letter, and to
    "UNKNOWN" for empty/None input.
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
+
53
# Rendering resolution (dots per inch) used when rasterizing PDF pages.
DPI = 300
# Matches figure-caption leaders such as "Fig. 3" / "Figure 12" at line start.
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
+
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    The PDF is base64-embedded in the request together with an extraction
    prompt; the response is constrained by SCHEMA (structured output).
    Returns the decoded JSON dict, or None on any failure (an error is
    shown in the Streamlit UI).

    NOTE(review): `filename` is currently unused — presumably intended for
    logging/labels; confirm before removing.
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    # Extraction instructions; field names here must match SCHEMA.
    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    # generateContent payload: prompt text + inline PDF data, with
    # deterministic decoding (temperature 0) and schema-constrained JSON.
    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # Take the first part whose text looks like a JSON object.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        st.error(f"Gemini API Error: {e}")
        return None
111
+
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the Gemini extraction JSON into one row per property.

    Guarantees a non-empty material_abbreviation by deriving one from the
    material name when the model left it blank, and substitutes sensible
    fallbacks for blank/missing per-property fields.
    """
    mat_name = data.get("material_name", "") or ""
    mat_abbr = data.get("material_abbreviation", "") or ""
    if not mat_abbr:
        mat_abbr = make_abbreviation(mat_name)

    def _row(prop):
        # Blank or missing fields get the same defaults the UI expects.
        return {
            "material_name": mat_name,
            "material_abbreviation": mat_abbr,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }

    return pd.DataFrame([_row(p) for p in data.get("mechanical_properties", [])])
134
+
135
+ # --- IMAGE EXTRACTION LOGIC ---
136
def get_page_image(page):
    """Rasterize a PDF page at DPI and return it as a BGR OpenCV image."""
    scale = DPI / 72  # PDF coordinates are in points (1/72 inch)
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, 3)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
140
+
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a plot.

    Rejects tiny crops and mostly-ink regions (likely dense text or
    photos), then requires at least one long horizontal or vertical run
    of ink — i.e. something resembling a plot axis or frame line.
    """
    height, width = binary_crop.shape

    # Too small to be a meaningful figure.
    if height < 100 or width < 100:
        return False

    # Very dense ink coverage suggests text/photo, not a line plot.
    if cv2.countNonZero(binary_crop) / (width * height) > 0.35:
        return False

    def _has_long_run(kernel_shape):
        # Erosion leaves pixels only where an unbroken ink run at least
        # as long as the kernel exists.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_shape)
        return cv2.countNonZero(cv2.erode(binary_crop, kernel, iterations=1)) > 0

    return _has_long_run((width // 4, 1)) or _has_long_run((1, height // 4))
153
+
154
def merge_boxes(rects):
    """Drop any box that lies (within a 15px margin) inside a kept box.

    Boxes are (x, y, w, h) tuples. Processing largest-area first ensures a
    contained box is always compared against its potential container.
    """
    if not rects:
        return []

    kept = []
    for box in sorted(rects, key=lambda r: r[2] * r[3], reverse=True):
        x, y, w, h = box
        contained = False
        for kx, ky, kw, kh in kept:
            # Fully inside a kept box, allowing 15 pixels of slack per edge.
            if (x >= kx - 15 and y >= ky - 15
                    and x + w <= kx + kw + 15 and y + h <= ky + kh + 15):
                contained = True
                break
        if not contained:
            kept.append(box)
    return kept
164
+
165
def extract_images(pdf_doc):
    """Detect plot-like regions on every page and crop them out.

    Pipeline per page: rasterize -> binarize -> dilate -> find contours ->
    filter by size/geometry heuristics -> dedupe nested boxes -> pair each
    crop with the nearest "Fig./Figure N" caption below it. Crops are kept
    in memory (PNG bytes plus the raw BGR array) and grouped by caption.

    Returns a list of dicts: {"caption", "page", "image_data": [...]}.
    """
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # extra pixels around each detected box when cropping

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Near-white -> 0, ink -> 255 (inverted binary threshold).
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilate so nearby ink merges into single candidate regions.
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions between 3% and 80% of the page area.
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            # Find the nearest caption BELOW the box (within 30% of page height).
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    # Text blocks are in PDF points; scale to raster pixels.
                    cap_y = b[1] * (DPI/72)
                    dist = cap_y - (cy + ch)
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Crop with padding, clamped to the page bounds.
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,
                "array": crop
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
+
221
def create_zip(results, include_json=True):
    """Bundle all extracted plot images into an in-memory zip archive.

    When include_json is True, a "plot_data.json" summary (caption, page,
    image count per figure group) is added alongside the PNGs. Returns the
    archive contents as bytes.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            # Summarize each figure group for the accompanying metadata file.
            summary = []
            for entry in results:
                summary.append({"caption": entry["caption"], "page": entry["page"],
                                "image_count": len(entry["image_data"])})
            zf.writestr("plot_data.json", json.dumps(summary, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
+
237
def input_form():
    """Render the manual property-entry form.

    Three cascading selectboxes (material class -> property category ->
    property name) gate a form for a single property row. A submitted,
    validated row is appended to st.session_state["user_uploaded_data"].
    """
    # Property categories offered per material class.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category) pair; keys
    # must mirror PROPERTY_CATEGORIES.
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Second selectbox appears only once a class is chosen.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    # Third selectbox appears only once both previous choices are made.
    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    # The data-entry form is shown only after all three selections.
    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

            if submitted:
                # Minimal validation: name and value are mandatory.
                if not (material_name and value):
                    st.error("Material name and value are required.")

                else:
                    # Single-row DataFrame so it can be concatenated onto
                    # the accumulated session-state table.
                    Input_db = pd.DataFrame([{
                        "material_class": material_class,
                        "material_name": material_name,
                        "material_abbreviation": material_abbr,
                        "section": property_category,
                        "property_name": property_name,
                        "value": value,
                        "unit": unit,
                        "english_units": english,
                        "test_condition": test_condition,
                        "comments": comments
                    }])

                    st.success("Property added successfully")
                    st.dataframe(Input_db)

                    # First submission seeds the table; later ones append.
                    if "user_uploaded_data" not in st.session_state:
                        st.session_state["user_uploaded_data"] = Input_db
                        return
                    else:
                        st.session_state["user_uploaded_data"] = pd.concat(
                            [st.session_state["user_uploaded_data"], Input_db],
                            ignore_index=True
                        )

    return
427
+
428
def main():
    """Streamlit entry point: manual data entry plus PDF data/plot extraction.

    Session-state keys:
      image_results      -- extracted plot groups for the current PDF
      pdf_processed      -- True once plots were extracted for the current PDF
      current_pdf_name   -- name of the last uploaded PDF (change detection)
      form_submitted     -- True right after the manual form added a row
      pdf_data_extracted -- True once Gemini extraction ran for the current PDF
      pdf_extracted_df   -- DataFrame of properties extracted from the PDF
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # Seed session state with defaults on first run.
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect whether the manual input form just added a row by comparing the
    # accumulated table's length before and after rendering the form.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # Nothing uploaded: reset all per-PDF state and stop.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # File-safe identifier derived from the upload's name.
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    if st.session_state.current_pdf_name != uploaded_file.name:
        # A different PDF was uploaded: invalidate ALL cached results.
        # BUGFIX: pdf_data_extracted / pdf_extracted_df were previously not
        # reset here, so the old PDF's extracted table kept showing for the
        # new file.
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()

    if st.session_state.form_submitted:
        # A manual form row was just added during this rerun; skip the
        # (already-done) PDF extraction work and show status only.
        st.session_state.form_submitted = False
        # BUGFIX: the original string concatenation was missing a separator
        # and rendered as "data/plotsupload again".
        st.info("A Form was submitted. But your previous extracted data has been added already. "
                "If you want to extract more data/plots, upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    with tempfile.TemporaryDirectory() as tmpdir:
        # Persist the upload so PyMuPDF / the API reader can open it by path.
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF; results are cached in session state.
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data  # keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")

            # After extraction, or when rerunning, use the stored data.
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # Also set material_type for Page 1 filtering.
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            # Plot extraction also runs only once per PDF.
            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Iterate over a snapshot so in-loop deletions (which
                    # shrink the live list) don't raise IndexError.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        # BUGFIX: previously passed width=img_width,
                                        # but img_width was never defined (NameError).
                                        st.image(img_data['array'], use_container_width=True, channels="BGR")
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")

if __name__ == "__main__":
    main()
src/pages/categorized/propgraph.jpg ADDED