gvlktejaswi commited on
Commit
bad95eb
·
verified ·
1 Parent(s): b443b15

Upload 24 files

Browse files
.gitattributes CHANGED
@@ -40,3 +40,4 @@ src/images/images/Epoxy[[:space:]]+[[:space:]]44%[[:space:]]Carbon[[:space:]]fib
40
  src/images/images/Home.png filter=lfs diff=lfs merge=lfs -text
41
  src/images/images/logo.png filter=lfs diff=lfs merge=lfs -text
42
  src/images/images/us_deptenergy.jpg filter=lfs diff=lfs merge=lfs -text
 
 
40
  src/images/images/Home.png filter=lfs diff=lfs merge=lfs -text
41
  src/images/images/logo.png filter=lfs diff=lfs merge=lfs -text
42
  src/images/images/us_deptenergy.jpg filter=lfs diff=lfs merge=lfs -text
43
+ src/pages/categorized/ESS-min.jpg filter=lfs diff=lfs merge=lfs -text
src/pages/3_Categorized_Search.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: categorized search.

Rendering is delegated to ``pages.categorized.page1``; the import happens
inside the loader so the sub-page module is only loaded when this page runs.
"""
import streamlit as st
from PIL import Image  # kept for parity with sibling pages; not used directly here


def load_page1():
    """Render the 'Material Type' categorized-search sub-page."""
    from pages.categorized.page1 import main
    main()


load_page1()

# Spacer rows push the logo toward the bottom of the sidebar.
for _ in range(8):
    st.sidebar.write("")
st.sidebar.image("logo.png", caption=" ", width=150)
src/pages/5_Upload_Data.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit page: data upload.

Runs the upload flow implemented in ``pages.categorized.page6``.
"""
import streamlit as st
from PIL import Image  # kept for parity with sibling pages; not used directly here


def load_page6():
    """Render the upload-data sub-page."""
    from pages.categorized.page6 import main
    main()


def load_page3():
    """Alternate flow (pages.categorized.page3); defined but not invoked."""
    from pages.categorized.page3 import main
    main()


load_page6()
src/pages/categorized/Backend/Pdf_DataExtraction.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import requests
5
+ import base64
6
+ import json
7
+ import os
8
+ from typing import Dict, Any, Optional
9
+
10
+
11
+
12
+
13
# Backend PDF extraction Logic
# SECURITY FIX: the Gemini API key was previously hard-coded here and committed
# to the repository — a committed key must be treated as compromised and rotated.
# Read it from the environment instead (empty string when unset, so the module
# still imports; API calls will fail with an auth error until the key is provided).
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
16
+
17
# JSON schema enforced on the Gemini response via generationConfig.responseSchema.
# NOTE: despite its name, 'mechanical_properties' is a single flat list that the
# prompt uses for properties of every category (mechanical, thermal, etc.).
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # 'unit' and 'test_condition' may be omitted; the rest are mandatory.
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
40
+
41
+ # === GEMINI CALL FUNCTION ===
42
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    Args:
        pdf_bytes: Raw PDF file contents.
        filename: Original file name; accepted for interface compatibility
            but not transmitted to the API.

    Returns:
        The decoded JSON object on success, or None on any failure
        (errors are surfaced to the UI via ``st.error``).
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    # FIX: the previous prompt asked for fields (experiment_name, measured_value,
    # uncertainty, method, conditions) that do not exist in SCHEMA — contradicting
    # the enforced responseSchema and the fields convert_to_dataframe reads.
    # This prompt matches SCHEMA exactly.
    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
                ]
            }
        ],
        "generationConfig": {
            "temperature": 0,  # deterministic extraction
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # The model may return several parts; take the first that looks like JSON.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        st.error(f"Gemini API Error: {e}")
        return None
103
+
104
+
105
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extraction JSON into a property-per-row DataFrame.

    Each row carries the top-level material name/abbreviation plus one entry
    from ``mechanical_properties``; missing keys become empty strings.
    """
    material = data.get("material_name", "")
    abbrev = data.get("material_abbreviation", "")
    records = [
        {
            "material_name": material,
            "material_abbreviation": abbrev,
            "section": prop.get("section", ""),
            "property_name": prop.get("property_name", ""),
            "value": prop.get("value", ""),
            "unit": prop.get("unit", ""),
            "english": prop.get("english", ""),
            "test_condition": prop.get("test_condition", ""),
            "comments": prop.get("comments", ""),
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
src/pages/categorized/Backend/Pdf_ImageExtraction.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import math
5
+ import tempfile
6
+ import fitz # PyMuPDF
7
+ import cv2
8
+ import numpy as np
9
+ from PIL import Image
10
+ import streamlit as st
11
+
12
# -------------------
# Config
# -------------------
DPI = 300  # rasterization resolution (dots per inch) used by render_page
OUT_DIR = "outputs"  # root folder for saved crops and the JSON manifest

# When True, keep only captions matching the stress/strain keyword pattern below.
KEEP_ONLY_STRESS_STRAIN = False

# Matches caption blocks that start with "Fig. N" / "Figure N".
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
# Keyword filter for stress/strain-related captions (used with KEEP_ONLY_STRESS_STRAIN).
SS_KW = re.compile(
    r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
    re.IGNORECASE
)
25
+
26
+ # -------------------
27
+ # Render helpers
28
+ # -------------------
29
def render_page(page, dpi=DPI):
    """Rasterize a PyMuPDF page to a PIL RGB image.

    Returns (image, matrix); the matrix maps PDF points (72/inch baseline)
    to pixel coordinates at the requested DPI and is reused by
    pdf_to_px_bbox to convert caption bboxes.
    """
    mat = fitz.Matrix(dpi/72, dpi/72)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img, mat
34
+
35
def pdf_to_px_bbox(bbox_pdf, mat):
    """Scale a PDF-space (x0, y0, x1, y1) bbox into pixel space.

    Uses the render matrix's diagonal (mat.a, mat.d) as the x/y scale
    factors and truncates to ints.
    """
    scale_x, scale_y = mat.a, mat.d
    x0, y0, x1, y1 = bbox_pdf
    return (
        int(float(x0) * scale_x),
        int(float(y0) * scale_y),
        int(float(x1) * scale_x),
        int(float(y1) * scale_y),
    )
39
+
40
def safe_crop_px(pil_img, box):
    """Crop *box* (x0, y0, x1, y1) out of *pil_img*, defensively.

    Normalizes swapped coordinates, clamps to the image bounds, and returns
    None for any malformed box (wrong type/arity, nested values, non-numeric
    entries) or a degenerate (zero-area) crop.
    """
    if not isinstance(box, (tuple, list)):
        return None
    # Unwrap a singly-nested box like [(x0, y0, x1, y1)].
    if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
        box = box[0]
    if len(box) != 4:
        return None
    if any(isinstance(coord, (tuple, list)) for coord in box):
        return None

    try:
        left, top, right, bottom = (int(c) for c in box)
    except (TypeError, ValueError):
        return None

    # Normalize reversed coordinates.
    if right < left:
        left, right = right, left
    if bottom < top:
        top, bottom = bottom, top

    # Clamp to the image bounds.
    width, height = pil_img.size
    left = min(max(left, 0), width)
    right = min(max(right, 0), width)
    top = min(max(top, 0), height)
    bottom = min(max(bottom, 0), height)

    if right <= left or bottom <= top:
        return None
    return pil_img.crop((left, top, right, bottom))
73
+
74
+ # -------------------
75
+ # Captions
76
+ # -------------------
77
def find_caption_blocks(page):
    """Collect text blocks on *page* whose text starts like a figure caption.

    Whitespace in each block is collapsed, and blocks matching CAP_RE
    ("Fig. N" / "Figure N") are returned as {"bbox": ..., "text": ...} dicts.
    """
    captions = []
    for block in page.get_text("blocks"):
        x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
        normalized = " ".join(str(block[4]).strip().split())
        if CAP_RE.match(normalized):
            captions.append({"bbox": (x0, y0, x1, y1), "text": normalized})
    return captions
86
+
87
+ # -------------------
88
+ # Dedupe: dHash
89
+ # -------------------
90
def dhash64(pil_img):
    """64-bit difference hash of an image, for near-duplicate detection.

    Downscales to a 9x8 grayscale grid and sets one bit per adjacent-pixel
    comparison (left > right), row by row.
    """
    gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
    pixels = list(gray.getdata())
    bits = 0
    for row in range(8):
        base = row * 9
        for col in range(8):
            bits <<= 1
            if pixels[base + col] > pixels[base + col + 1]:
                bits |= 1
    return bits
100
+
101
+ # -------------------
102
+ # Rejectors
103
+ # -------------------
104
def has_colorbar_like_strip(pil_img):
    """Heuristic: True when the image's right edge looks like a colorbar.

    Quantizes the rightmost ~7% strip to a coarse color grid and flags the
    image when the strip contains more than 70 distinct coarse colors —
    typical of heat-map colorbars, not of line plots.
    """
    arr = np.array(pil_img)
    if arr.ndim != 3:
        return False
    height, width = arr.shape[0], arr.shape[1]
    # Too small to hold a meaningful colorbar strip.
    if width < 250 or height < 150:
        return False
    strip_width = max(18, int(0.07 * width))
    right_strip = arr[:, width - strip_width:width, :]
    quantized = (right_strip // 24).reshape(-1, 3)
    distinct = np.unique(quantized, axis=0)
    return len(distinct) > 70
116
+
117
def texture_score(pil_img):
    """Variance of the Laplacian of the grayscale image.

    High values indicate dense high-frequency detail; is_real_plot rejects
    crops scoring above 2200 as photo-like rather than line plots.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    lap = cv2.Laplacian(gray, cv2.CV_64F)
    return float(lap.var())
121
+
122
def is_mostly_legend(pil_img):
    """Heuristic: True for small crops that are likely just a legend box.

    Otsu-binarizes (inverted, so ink is non-zero), denoises, and flags crops
    with moderate ink coverage (3%-18%) whose shorter side is under 260 px.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    bw = cv2.medianBlur(bw, 3)
    H, W = bw.shape
    # Fraction of pixels that are "ink" after thresholding.
    fill = float(np.count_nonzero(bw)) / float(H * W)
    return (0.03 < fill < 0.18) and (min(H, W) < 260)
129
+
130
+ # -------------------
131
+ # Plot detection
132
+ # -------------------
133
def detect_axes_lines(pil_img):
    """Find the longest horizontal and vertical line segments (candidate axes).

    Runs Canny + probabilistic Hough and returns (x_axis, y_axis) as
    (x1, y1, x2, y2) tuples, or (None, None) when either orientation has no
    sufficiently long, sufficiently straight candidate.
    """
    gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    H, W = gray.shape
    # Require segments at least 28% of the shorter image side.
    min_len = int(0.28 * min(H, W))

    lines = cv2.HoughLinesP(
        edges, 1, np.pi/180,
        threshold=90,
        minLineLength=min_len,
        maxLineGap=14
    )
    if lines is None:
        return None, None

    horizontals, verticals = [], []
    for x1, y1, x2, y2 in lines[:, 0]:
        dx, dy = abs(x2-x1), abs(y2-y1)
        length = math.hypot(dx, dy)
        # Near-horizontal: <18 px vertical drift, spanning >35% of the width.
        if dy < 18 and dx > 0.35 * W:
            horizontals.append((length, (x1, y1, x2, y2)))
        # Near-vertical: <18 px horizontal drift, spanning >35% of the height.
        if dx < 18 and dy > 0.35 * H:
            verticals.append((length, (x1, y1, x2, y2)))

    if not horizontals or not verticals:
        return None, None

    # Keep the longest candidate in each orientation.
    horizontals.sort(key=lambda t: t[0], reverse=True)
    verticals.sort(key=lambda t: t[0], reverse=True)
    return horizontals[0][1], verticals[0][1]
163
+
164
def axis_intersection_ok(x_axis, y_axis, W, H):
    """Sanity-check that detected axes meet in a plausible plot position.

    ``x_axis``/``y_axis`` are (x1, y1, x2, y2) segments; W/H is the image
    size. Rejects midpoints outside the image, a y-axis hugging the right
    edge (>95% of W), or an x-axis hugging the top edge (<5% of H).
    """
    x_axis_y = int(round((x_axis[1] + x_axis[3]) / 2))
    y_axis_x = int(round((y_axis[0] + y_axis[2]) / 2))
    if not (0 <= x_axis_y < H and 0 <= y_axis_x < W):
        return False
    return y_axis_x <= int(0.95 * W) and x_axis_y >= int(0.05 * H)
172
+
173
def tick_text_presence_score(pil_img, x_axis, y_axis):
    """Count small ink blobs (tick labels / numbers) near both axes.

    A genuine plot carries many small text components along its axes; the
    returned score is the blob count from a band around the x-axis plus a
    band left of the y-axis. is_real_plot requires a score >= 18.
    """
    img = np.array(pil_img)
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    bw = cv2.medianBlur(bw, 3)

    H, W = gray.shape
    # Axis midpoints: y of the x-axis, x of the y-axis.
    xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
    ya_x = int(round((y_axis[0] + y_axis[2]) / 2))

    # Horizontal band around/below the x-axis where tick labels sit.
    y0a = max(0, xa_y - 40)
    y1a = min(H, xa_y + 110)
    x_roi = bw[y0a:y1a, 0:W]

    # Vertical band left of the y-axis where tick labels sit.
    x0b = max(0, ya_x - 180)
    x1b = min(W, ya_x + 50)
    y_roi = bw[0:H, x0b:x1b]

    def count_small_components(mask):
        # Count blobs whose bounding box and area are plausible for printed text.
        num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
        cnt = 0
        for i in range(1, num):
            x, y, w, h, area = stats[i]
            if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
                cnt += 1
        return cnt

    return count_small_components(x_roi) + count_small_components(y_roi)
201
+
202
def is_real_plot(pil_img):
    """Decide whether a crop is a genuine data plot.

    Rejection cascade: colorbar-dominated crops, legend-only crops, crops
    without a detectable axis pair, axes meeting in an implausible place,
    photo-like textures (Laplacian variance > 2200), and crops with fewer
    than 18 tick-label-sized blobs near the axes.
    """
    if has_colorbar_like_strip(pil_img):
        return False
    if is_mostly_legend(pil_img):
        return False

    x_axis, y_axis = detect_axes_lines(pil_img)
    if x_axis is None or y_axis is None:
        return False

    arr = np.array(pil_img)
    H, W = arr.shape[0], arr.shape[1]
    if not axis_intersection_ok(x_axis, y_axis, W, H):
        return False

    # High-frequency texture suggests a photograph/micrograph, not a plot.
    if texture_score(pil_img) > 2200:
        return False

    score = tick_text_presence_score(pil_img, x_axis, y_axis)
    return score >= 18
222
+
223
+ # -------------------
224
+ # Candidate boxes in a region
225
+ # -------------------
226
def connected_components_boxes(pil_img):
    """Bounding boxes of non-background regions, largest first.

    Pixels darker than 245 (i.e. anything that is not near-white background)
    are merged with a 7x7 morphological close, then labeled. Returns
    [(area, (x0, y0, x1, y1)), ...] sorted by area descending.
    """
    img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    mask = (gray < 245).astype(np.uint8) * 255
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
    num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)

    boxes = []
    for i in range(1, num):
        x, y, w, h, area = stats[i]
        boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
    boxes.sort(key=lambda t: t[0], reverse=True)
    return boxes
239
+
240
def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
    """Grow *box* by margins proportional to its own size, clamped to (W, H).

    Each margin is a fraction of the box's own width/height; the bottom
    margin is largest so axis labels below the plot are kept.
    """
    x0, y0, x1, y1 = box
    box_w = x1 - x0
    box_h = y1 - y0
    grown_x0 = max(0, int(x0 - left * box_w))
    grown_x1 = min(W, int(x1 + right * box_w))
    grown_y0 = max(0, int(y0 - top * box_h))
    grown_y1 = min(H, int(y1 + bottom * box_h))
    return (grown_x0, grown_y0, grown_x1, grown_y1)
249
+
250
+ # -------------------
251
+ # Crop plot from caption
252
+ # -------------------
253
def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
    """Locate the plot belonging to a caption and crop it from the page image.

    Converts the caption bbox to pixels, searches the region above it (plus a
    small margin below), expands each large connected component, and keeps
    the largest candidate that passes is_real_plot(). Returns a PIL image or
    None when nothing qualifies.
    """
    cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
    cap_y0 = cap_px[1]
    cap_y1 = cap_px[3]

    W, H = page_img.size
    # Search window: up to 95% of the page height above the caption,
    # plus a 20% margin below it (captions usually sit under their figure).
    search_top = max(0, cap_y0 - int(0.95 * H))
    search_bot = min(H, cap_y1 + int(0.20 * H))
    region = safe_crop_px(page_img, (0, search_top, W, search_bot))
    if region is None:
        return None

    comps = connected_components_boxes(region)
    best = None
    best_area = -1

    # Only the 35 largest components are considered; smaller ones are noise.
    for area, box in comps[:35]:
        x0, y0, x1, y1 = box
        bw = x1 - x0
        bh = y1 - y0
        # Too small to be a readable plot at 300 DPI.
        if bw < 220 or bh < 180:
            continue

        exp = expand_box(box, region.size[0], region.size[1])
        cand = safe_crop_px(region, exp)
        if cand is None:
            continue

        if not is_real_plot(cand):
            continue

        if area > best_area:
            best_area = area
            best = cand

    return best
289
+
290
+ # -------------------
291
+ # Streamlit UI
292
+ # -------------------
293
def run_extraction(pdf_path, paper_id="uploaded_paper"):
    """Extract caption-anchored plot crops from a PDF.

    Saves each accepted crop under OUT_DIR/<paper_id>/plots_with_axes/ and
    writes a plots_with_axes.json manifest next to it.

    Args:
        pdf_path: Path to the PDF on disk.
        paper_id: Subdirectory name for this paper's outputs.

    Returns:
        (results, out_json): list of {page, caption, image} records and the
        path of the JSON manifest.
    """
    out_paper = os.path.join(OUT_DIR, paper_id)
    out_imgs = os.path.join(out_paper, "plots_with_axes")
    os.makedirs(out_imgs, exist_ok=True)

    doc = fitz.open(pdf_path)
    results = []
    seen = set()  # dHash values of crops already saved (perceptual dedupe)
    saved = 0

    try:  # FIX: the document handle was never closed; release it even on error
        for p in range(len(doc)):
            page = doc[p]
            caps = find_caption_blocks(page)
            if not caps:
                continue  # no figure captions -> nothing to anchor a crop to

            page_img, mat = render_page(page, dpi=DPI)

            for cap in caps:
                cap_text = cap["text"]

                if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
                    continue

                fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
                if fig is None:
                    continue

                # Shave a 2 px border to drop expansion artifacts.
                if fig.size[0] > 8 and fig.size[1] > 8:
                    fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))

                try:
                    h = dhash64(fig)
                except Exception:
                    continue

                if h in seen:
                    continue  # near-duplicate of an already-saved crop
                seen.add(h)

                img_name = f"p{p+1:02d}_{saved:04d}.png"
                img_path = os.path.join(out_imgs, img_name)
                fig.save(img_path)

                results.append({
                    "page": p + 1,
                    "caption": cap_text,
                    "image": img_path
                })
                saved += 1
    finally:
        doc.close()

    out_json = os.path.join(out_paper, "plots_with_axes.json")
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results, out_json
349
+
350
def main():
    """Streamlit entry point: upload a PDF, run extraction, show/download results."""
    st.set_page_config(page_title="Research Paper Plot Extractor", layout="wide")
    st.title(" Plot Extractor (Upload PDF)")

    uploaded = st.file_uploader("Upload a research paper PDF", type=["pdf"])
    if not uploaded:
        st.info("Upload a PDF to extract plots.")
        return

    # Derive a filesystem-safe output folder name from the uploaded file name.
    paper_id = os.path.splitext(uploaded.name)[0].replace(" ", "_")

    with tempfile.TemporaryDirectory() as tmpdir:
        # run_extraction needs a real file path, so spool the upload to disk.
        pdf_path = os.path.join(tmpdir, uploaded.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded.read())

        with st.spinner("Extracting plots..."):
            results, out_json = run_extraction(pdf_path, paper_id=paper_id)

        st.success(f"Extracted {len(results)} plots.")

        # Show images + captions
        for r in results:
            st.markdown(f"**Page {r['page']}** — {r['caption']}")
            st.image(r["image"], use_container_width=True)
            st.divider()

        # JSON viewer + download
        st.subheader("JSON Output")
        st.json(results)

        with open(out_json, "rb") as f:
            st.download_button(
                "Download JSON",
                data=f,
                file_name=os.path.basename(out_json),
                mime="application/json"
            )

if __name__ == "__main__":
    main()
src/pages/categorized/ESS-min.jpg ADDED

Git LFS Details

  • SHA256: ff58c9304c39dc90ca15b516a1f1ec385ea60a9829c5dd9eb698ee1f82778eb7
  • Pointer size: 131 Bytes
  • Size of remote file: 356 kB
src/pages/categorized/Temp_Backup.py ADDED
@@ -0,0 +1,736 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import math
5
+ import tempfile
6
+ import fitz # PyMuPDF
7
+ import cv2
8
+ import numpy as np
9
+ from PIL import Image
10
+ import streamlit as st
11
+ import pandas as pd
12
+ import requests
13
+ import base64
14
+ from typing import Dict, Any, Optional
15
+
16
# SECURITY FIX: the Gemini API key was previously hard-coded here and committed
# to the repository — a committed key must be treated as compromised and rotated.
# Read it from the environment instead (empty string when unset).
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
18
+
19
+ SCHEMA = {
20
+ "type": "OBJECT",
21
+ "properties": {
22
+ "material_name": {"type": "STRING"},
23
+ "material_abbreviation": {"type": "STRING"},
24
+ "mechanical_properties": {
25
+ "type": "ARRAY",
26
+ "items": {
27
+ "type": "OBJECT",
28
+ "properties": {
29
+ "section": {"type": "STRING"},
30
+ "property_name": {"type": "STRING"},
31
+ "value": {"type": "STRING"},
32
+ "unit": {"type": "STRING"},
33
+ "english": {"type": "STRING"},
34
+ "test_condition": {"type": "STRING"},
35
+ "comments": {"type": "STRING"}
36
+ },
37
+ "required": ["section", "property_name", "value", "english", "comments"]
38
+ }
39
+ }
40
+ }
41
+ }
42
def make_abbreviation(name: str) -> str:
    """Derive an abbreviation from a material name.

    Uses the uppercased alphabetic initials of the words in *name*; falls
    back to the first six characters uppercased when no word starts with a
    letter, and to "UNKNOWN" for an empty name.
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    abbr = "".join(initials).upper()
    return abbr if abbr else name[:6].upper()
49
+
50
+ DPI = 300
51
+ OUT_DIR = "outputs"
52
+ KEEP_ONLY_STRESS_STRAIN = False
53
+ CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
54
+ SS_KW = re.compile(
55
+ r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
56
+ re.IGNORECASE
57
+ )
58
+
59
+ def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
60
+ """Calls Gemini API with PDF bytes"""
61
+ try:
62
+ encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
63
+ mime_type = "application/pdf"
64
+ except Exception as e:
65
+ st.error(f"Error encoding PDF: {e}")
66
+ return None
67
+
68
+ prompt = (
69
+ "You are an expert materials scientist. From the attached PDF, extract the material name, "
70
+ "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
71
+ "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
72
+ "For each property, you MUST extract:\n"
73
+ "- property_name\n- value (or range)\n- unit\n"
74
+ "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
75
+ "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
76
+ "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
77
+ )
78
+
79
+ payload = {
80
+ "contents": [{
81
+ "parts": [
82
+ {"text": prompt},
83
+ {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
84
+ ]
85
+ }],
86
+ "generationConfig": {
87
+ "temperature": 0,
88
+ "responseMimeType": "application/json",
89
+ "responseSchema": SCHEMA
90
+ }
91
+ }
92
+
93
+ try:
94
+ r = requests.post(API_URL, json=payload, timeout=300)
95
+ r.raise_for_status()
96
+ data = r.json()
97
+
98
+ candidates = data.get("candidates", [])
99
+ if not candidates:
100
+ return None
101
+
102
+ parts = candidates[0].get("content", {}).get("parts", [])
103
+ json_text = None
104
+ for p in parts:
105
+ t = p.get("text", "")
106
+ if t.strip().startswith("{"):
107
+ json_text = t
108
+ break
109
+
110
+ return json.loads(json_text) if json_text else None
111
+ except Exception as e:
112
+ st.error(f"Gemini API Error: {e}")
113
+ return None
114
+
115
+ # def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
116
+ # """Convert extracted JSON to DataFrame"""
117
+ # rows = []
118
+ # for item in data.get("mechanical_properties", []):
119
+ # rows.append({
120
+ # "material_name": data.get("material_name", ""),
121
+ # "material_abbreviation": data.get("material_abbreviation", ""),
122
+ # "section": item.get("section", ""),
123
+ # "property_name": item.get("property_name", ""),
124
+ # "value": item.get("value", ""),
125
+ # "unit": item.get("unit", ""),
126
+ # "english": item.get("english", ""),
127
+ # "test_condition": item.get("test_condition", ""),
128
+ # "comments": item.get("comments", "")
129
+ # })
130
+ # return pd.DataFrame(rows)
131
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the extraction JSON into rows, backfilling missing fields.

    Guarantees a non-empty material_abbreviation (derived from the name via
    make_abbreviation when absent) and substitutes defaults for empty
    section ("Mechanical"), property_name ("Unknown property") and value
    ("N/A") fields; other empty fields become "".
    """
    name = data.get("material_name", "") or ""
    abbrev = data.get("material_abbreviation", "") or ""
    if not abbrev:
        abbrev = make_abbreviation(name)

    records = [
        {
            "material_name": name,
            "material_abbreviation": abbrev,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }
        for prop in data.get("mechanical_properties", [])
    ]
    return pd.DataFrame(records)
153
+
154
+ def render_page(page, dpi=DPI):
155
+ mat = fitz.Matrix(dpi/72, dpi/72)
156
+ pix = page.get_pixmap(matrix=mat, alpha=False)
157
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
158
+ return img, mat
159
+
160
+ def pdf_to_px_bbox(bbox_pdf, mat):
161
+ x0, y0, x1, y1 = bbox_pdf
162
+ sx, sy = mat.a, mat.d
163
+ return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
164
+
165
+ def safe_crop_px(pil_img, box):
166
+ if not isinstance(box, (tuple, list)):
167
+ return None
168
+ if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
169
+ box = box[0]
170
+ if len(box) != 4:
171
+ return None
172
+
173
+ x0, y0, x1, y1 = box
174
+ if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
175
+ return None
176
+
177
+ try:
178
+ x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
179
+ except (TypeError, ValueError):
180
+ return None
181
+
182
+ if x1 < x0: x0, x1 = x1, x0
183
+ if y1 < y0: y0, y1 = y1, y0
184
+
185
+ W, H = pil_img.size
186
+ x0 = max(0, min(W, x0))
187
+ x1 = max(0, min(W, x1))
188
+ y0 = max(0, min(H, y0))
189
+ y1 = max(0, min(H, y1))
190
+ if x1 <= x0 or y1 <= y0:
191
+ return None
192
+ return pil_img.crop((x0, y0, x1, y1))
193
+
194
+ def find_caption_blocks(page):
195
+ caps = []
196
+ blocks = page.get_text("blocks")
197
+ for b in blocks:
198
+ x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
199
+ t = " ".join(str(text).strip().split())
200
+ if CAP_RE.match(t):
201
+ caps.append({"bbox": (x0, y0, x1, y1), "text": t})
202
+ return caps
203
+
204
+ def dhash64(pil_img):
205
+ gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
206
+ pixels = list(gray.getdata())
207
+ bits = 0
208
+ for r in range(8):
209
+ for c in range(8):
210
+ left = pixels[r * 9 + c]
211
+ right = pixels[r * 9 + c + 1]
212
+ bits = (bits << 1) | (1 if left > right else 0)
213
+ return bits
214
+
215
+ def has_colorbar_like_strip(pil_img):
216
+ img = np.array(pil_img)
217
+ if img.ndim != 3:
218
+ return False
219
+ H, W, _ = img.shape
220
+ if W < 250 or H < 150:
221
+ return False
222
+ strip_w = max(18, int(0.07 * W))
223
+ strip = img[:, W-strip_w:W, :]
224
+ q = (strip // 24).reshape(-1, 3)
225
+ uniq = np.unique(q, axis=0)
226
+ return len(uniq) > 70
227
+
228
+ def texture_score(pil_img):
229
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
230
+ lap = cv2.Laplacian(gray, cv2.CV_64F)
231
+ return float(lap.var())
232
+
233
+ def is_mostly_legend(pil_img):
234
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
235
+ bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
236
+ bw = cv2.medianBlur(bw, 3)
237
+ H, W = bw.shape
238
+ fill = float(np.count_nonzero(bw)) / float(H * W)
239
+ return (0.03 < fill < 0.18) and (min(H, W) < 260)
240
+
241
+ def detect_axes_lines(pil_img):
242
+ gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
243
+ edges = cv2.Canny(gray, 50, 150)
244
+ H, W = gray.shape
245
+ min_len = int(0.28 * min(H, W))
246
+
247
+ lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=90, minLineLength=min_len, maxLineGap=14)
248
+ if lines is None:
249
+ return None, None
250
+
251
+ horizontals, verticals = [], []
252
+ for x1, y1, x2, y2 in lines[:, 0]:
253
+ dx, dy = abs(x2-x1), abs(y2-y1)
254
+ length = math.hypot(dx, dy)
255
+ if dy < 18 and dx > 0.35 * W:
256
+ horizontals.append((length, (x1, y1, x2, y2)))
257
+ if dx < 18 and dy > 0.35 * H:
258
+ verticals.append((length, (x1, y1, x2, y2)))
259
+
260
+ if not horizontals or not verticals:
261
+ return None, None
262
+
263
+ horizontals.sort(key=lambda t: t[0], reverse=True)
264
+ verticals.sort(key=lambda t: t[0], reverse=True)
265
+ return horizontals[0][1], verticals[0][1]
266
+
267
+ def axis_intersection_ok(x_axis, y_axis, W, H):
268
+ xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
269
+ ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
270
+ if not (0 <= xa_y < H and 0 <= ya_x < W):
271
+ return False
272
+ if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
273
+ return False
274
+ return True
275
+
276
+ def tick_text_presence_score(pil_img, x_axis, y_axis):
277
+ img = np.array(pil_img)
278
+ gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
279
+ bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
280
+ bw = cv2.medianBlur(bw, 3)
281
+
282
+ H, W = gray.shape
283
+ xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
284
+ ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
285
+
286
+ y0a = max(0, xa_y - 40)
287
+ y1a = min(H, xa_y + 110)
288
+ x_roi = bw[y0a:y1a, 0:W]
289
+
290
+ x0b = max(0, ya_x - 180)
291
+ x1b = min(W, ya_x + 50)
292
+ y_roi = bw[0:H, x0b:x1b]
293
+
294
+ def count_small_components(mask):
295
+ num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
296
+ cnt = 0
297
+ for i in range(1, num):
298
+ x, y, w, h, area = stats[i]
299
+ if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
300
+ cnt += 1
301
+ return cnt
302
+
303
+ return count_small_components(x_roi) + count_small_components(y_roi)
304
+
305
+ def is_real_plot(pil_img):
306
+ if has_colorbar_like_strip(pil_img):
307
+ return False
308
+ if is_mostly_legend(pil_img):
309
+ return False
310
+
311
+ x_axis, y_axis = detect_axes_lines(pil_img)
312
+ if x_axis is None or y_axis is None:
313
+ return False
314
+
315
+ arr = np.array(pil_img)
316
+ H, W = arr.shape[0], arr.shape[1]
317
+ if not axis_intersection_ok(x_axis, y_axis, W, H):
318
+ return False
319
+
320
+ if texture_score(pil_img) > 2200:
321
+ return False
322
+
323
+ score = tick_text_presence_score(pil_img, x_axis, y_axis)
324
+ return score >= 18
325
+
326
+ def connected_components_boxes(pil_img):
327
+ img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
328
+ gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
329
+ mask = (gray < 245).astype(np.uint8) * 255
330
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
331
+ num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
332
+
333
+ boxes = []
334
+ for i in range(1, num):
335
+ x, y, w, h, area = stats[i]
336
+ boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
337
+ boxes.sort(key=lambda t: t[0], reverse=True)
338
+ return boxes
339
+
340
+ def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
341
+ x0, y0, x1, y1 = box
342
+ bw = x1 - x0
343
+ bh = y1 - y0
344
+ ex0 = max(0, int(x0 - left * bw))
345
+ ex1 = min(W, int(x1 + right * bw))
346
+ ey0 = max(0, int(y0 - top * bh))
347
+ ey1 = min(H, int(y1 + bottom * bh))
348
+ return (ex0, ey0, ex1, ey1)
349
+
350
+ def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
351
+ cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
352
+ cap_y0 = cap_px[1]
353
+ cap_y1 = cap_px[3]
354
+
355
+ W, H = page_img.size
356
+ search_top = max(0, cap_y0 - int(0.95 * H))
357
+ search_bot = min(H, cap_y1 + int(0.20 * H))
358
+ region = safe_crop_px(page_img, (0, search_top, W, search_bot))
359
+ if region is None:
360
+ return None
361
+
362
+ comps = connected_components_boxes(region)
363
+ best = None
364
+ best_area = -1
365
+
366
+ for area, box in comps[:35]:
367
+ x0, y0, x1, y1 = box
368
+ bw = x1 - x0
369
+ bh = y1 - y0
370
+ if bw < 220 or bh < 180:
371
+ continue
372
+
373
+ exp = expand_box(box, region.size[0], region.size[1])
374
+ cand = safe_crop_px(region, exp)
375
+ if cand is None:
376
+ continue
377
+
378
+ if not is_real_plot(cand):
379
+ continue
380
+
381
+ if area > best_area:
382
+ best_area = area
383
+ best = cand
384
+
385
+ return best
386
+
387
+ def extract_images(pdf_path, paper_id="uploaded_paper"):
388
+ """Extract plot images from PDF"""
389
+ out_paper = os.path.join(OUT_DIR, paper_id)
390
+ out_imgs = os.path.join(out_paper, "plots_with_axes")
391
+ os.makedirs(out_imgs, exist_ok=True)
392
+
393
+ doc = fitz.open(pdf_path)
394
+ results = []
395
+ seen = set()
396
+ saved = 0
397
+
398
+ for p in range(len(doc)):
399
+ page = doc[p]
400
+ caps = find_caption_blocks(page)
401
+ if not caps:
402
+ continue
403
+
404
+ page_img, mat = render_page(page, dpi=DPI)
405
+
406
+ for cap in caps:
407
+ cap_text = cap["text"]
408
+
409
+ if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
410
+ continue
411
+
412
+ fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
413
+ if fig is None:
414
+ continue
415
+
416
+ if fig.size[0] > 8 and fig.size[1] > 8:
417
+ fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
418
+
419
+ try:
420
+ h = dhash64(fig)
421
+ except Exception:
422
+ continue
423
+
424
+ if h in seen:
425
+ continue
426
+ seen.add(h)
427
+
428
+ img_name = f"p{p+1:02d}_{saved:04d}.png"
429
+ img_path = os.path.join(out_imgs, img_name)
430
+ fig.save(img_path)
431
+
432
+ results.append({
433
+ "page": p + 1,
434
+ "caption": cap_text,
435
+ "image": img_path
436
+ })
437
+ saved += 1
438
+
439
+ return results
440
+
441
+ def input_form():
442
+ PROPERTY_CATEGORIES = {
443
+ "Polymer": [
444
+ "Thermal",
445
+ "Mechanical",
446
+ "Processing",
447
+ "Physical",
448
+ "Descriptive",
449
+ ],
450
+ "Fiber": [
451
+ "Mechanical",
452
+ "Physical",
453
+ "Thermal",
454
+ "Descriptive",
455
+ ],
456
+ "Composite": [
457
+ "Mechanical",
458
+ "Thermal",
459
+ "Processing",
460
+ "Physical",
461
+ "Descriptive",
462
+ "Composition / Reinforcement",
463
+ "Architecture / Structure",
464
+ ],
465
+ }
466
+
467
+ PROPERTY_NAMES = {
468
+ "Polymer": {
469
+ "Thermal": [
470
+ "Glass transition temperature (Tg)",
471
+ "Melting temperature (Tm)",
472
+ "Crystallization temperature (Tc)",
473
+ "Degree of crystallinity",
474
+ "Decomposition temperature",
475
+ ],
476
+ "Mechanical": [
477
+ "Tensile modulus",
478
+ "Tensile strength",
479
+ "Elongation at break",
480
+ "Flexural modulus",
481
+ "Impact strength",
482
+ ],
483
+ "Processing": [
484
+ "Melt flow index (MFI)",
485
+ "Processing temperature",
486
+ "Cooling rate",
487
+ "Mold shrinkage",
488
+ ],
489
+ "Physical": [
490
+ "Density",
491
+ "Specific gravity",
492
+ ],
493
+ "Descriptive": [
494
+ "Material grade",
495
+ "Manufacturer",
496
+ ],
497
+ },
498
+
499
+ "Fiber": {
500
+ "Mechanical": [
501
+ "Tensile modulus",
502
+ "Tensile strength",
503
+ "Strain to failure",
504
+ ],
505
+ "Physical": [
506
+ "Density",
507
+ "Fiber diameter",
508
+ ],
509
+ "Thermal": [
510
+ "Decomposition temperature",
511
+ ],
512
+ "Descriptive": [
513
+ "Fiber type",
514
+ "Surface treatment",
515
+ ],
516
+ },
517
+
518
+ "Composite": {
519
+ "Mechanical": [
520
+ "Longitudinal modulus (E1)",
521
+ "Transverse modulus (E2)",
522
+ "Shear modulus (G12)",
523
+ "Poissons ratio (V12)",
524
+ "Tensile strength (fiber direction)",
525
+ "Interlaminar shear strength",
526
+ ],
527
+ "Thermal": [
528
+ "Glass transition temperature (matrix)",
529
+ "Coefficient of thermal expansion (CTE)",
530
+ ],
531
+ "Processing": [
532
+ "Curing temperature",
533
+ "Curing pressure",
534
+ ],
535
+ "Physical": [
536
+ "Density",
537
+ ],
538
+ "Descriptive": [
539
+ "Laminate type",
540
+ ],
541
+ "Composition / Reinforcement": [
542
+ "Fiber volume fraction",
543
+ "Fiber weight fraction",
544
+ "Fiber type",
545
+ "Matrix type",
546
+ ],
547
+ "Architecture / Structure": [
548
+ "Weave type",
549
+ "Ply orientation",
550
+ "Number of plies",
551
+ "Stacking sequence",
552
+ ],
553
+ },
554
+ }
555
+
556
+
557
+
558
+ st.title("Materials Property Input Form")
559
+
560
+ material_class = st.selectbox(
561
+ "Select Material Class",
562
+ ("Polymer", "Fiber", "Composite"),
563
+ index=None,
564
+ placeholder="Choose material class",
565
+ )
566
+
567
+ if material_class:
568
+ property_category = st.selectbox(
569
+ "Select Property Category",
570
+ PROPERTY_CATEGORIES[material_class],
571
+ index=None,
572
+ placeholder="Choose property category",
573
+ )
574
+ else:
575
+ property_category = None
576
+
577
+ if material_class and property_category:
578
+ property_name = st.selectbox(
579
+ "Select Property",
580
+ PROPERTY_NAMES[material_class][property_category],
581
+ index=None,
582
+ placeholder="Choose property",
583
+ )
584
+ else:
585
+ property_name = None
586
+
587
+ if material_class and property_category and property_name:
588
+ with st.form("user_input"):
589
+ st.subheader("Enter Data")
590
+
591
+ material_name = st.text_input("Material Name")
592
+ material_abbr = st.text_input("Material Abbreviation")
593
+
594
+ value = st.text_input("Value")
595
+ unit = st.text_input("Unit (SI)")
596
+ english = st.text_input("English Units")
597
+ test_condition = st.text_input("Test Condition")
598
+ comments = st.text_area("Comments")
599
+
600
+ submitted = st.form_submit_button("Submit")
601
+
602
+ if submitted:
603
+ if not (material_name and value):
604
+ st.error("Material name and value are required.")
605
+ else:
606
+ Input_db = pd.DataFrame([{
607
+ "material_class": material_class,
608
+ "material_name": material_name,
609
+ "material_abbreviation": material_abbr,
610
+ "section": property_category,
611
+ "property_name": property_name,
612
+ "value": value,
613
+ "unit": unit,
614
+ "english_units": english,
615
+ "test_condition": test_condition,
616
+ "comments": comments
617
+ }])
618
+
619
+ st.success("Property added successfully")
620
+ st.dataframe(Input_db)
621
+
622
+
623
+ if "user_uploaded_data" not in st.session_state:
624
+ st.session_state["user_uploaded_data"] = Input_db
625
+ else:
626
+ st.session_state["user_uploaded_data"] = pd.concat(
627
+ [st.session_state["user_uploaded_data"], Input_db],
628
+ ignore_index=True
629
+ )
630
+ def main():
631
+ input_form()
632
+ st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")
633
+ st.title("PDF Material Data & Plot Extractor")
634
+
635
+ uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
636
+
637
+ if not uploaded_file:
638
+ st.info("Upload a PDF to extract material data and plots")
639
+ return
640
+
641
+ paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")
642
+
643
+ tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
644
+
645
+ with tempfile.TemporaryDirectory() as tmpdir:
646
+ pdf_path = os.path.join(tmpdir, uploaded_file.name)
647
+ with open(pdf_path, "wb") as f:
648
+ f.write(uploaded_file.getbuffer())
649
+
650
+ with tab1:
651
+ st.subheader("Material Properties Data")
652
+
653
+ with st.spinner(" Extracting material data..."):
654
+ with open(pdf_path, "rb") as f:
655
+ pdf_bytes = f.read()
656
+
657
+ data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
658
+
659
+ if data:
660
+ df = convert_to_dataframe(data)
661
+
662
+ if not df.empty:
663
+ st.success(f"Extracted {len(df)} properties")
664
+
665
+ col1, col2 = st.columns(2)
666
+ with col1:
667
+ st.metric("Material", data.get("material_name", "N/A"))
668
+ with col2:
669
+ st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))
670
+
671
+ st.dataframe(df, use_container_width=True, height=400)
672
+ st.subheader("Assign Material Category")
673
+
674
+ extracted_material_class = st.selectbox(
675
+ "Select category for this material",
676
+ ["Polymer", "Fiber", "Composite"],
677
+ index=None,
678
+ placeholder="Required before adding to database"
679
+ )
680
+ if st.button(" Add to Database"):
681
+ if not extracted_material_class:
682
+ st.error("Please select a material category before adding.")
683
+ else:
684
+ df["material_class"] = extracted_material_class
685
+
686
+ if "user_uploaded_data" not in st.session_state:
687
+ st.session_state["user_uploaded_data"] = df
688
+ else:
689
+ st.session_state["user_uploaded_data"] = pd.concat(
690
+ [st.session_state["user_uploaded_data"], df],
691
+ ignore_index=True
692
+ )
693
+
694
+ st.success(f"Added to {extracted_material_class} database!")
695
+
696
+ # if st.button(" Add to Database"):
697
+ # if "user_uploaded_data" not in st.session_state:
698
+ # st.session_state["user_uploaded_data"] = df
699
+ # else:
700
+ # st.session_state["user_uploaded_data"] = pd.concat(
701
+ # [st.session_state["user_uploaded_data"], df],
702
+ # ignore_index=True
703
+ # )
704
+ # st.success("Added to database!")
705
+
706
+ csv = df.to_csv(index=False)
707
+ st.download_button(
708
+ "Download CSV",
709
+ data=csv,
710
+ file_name=f"{paper_id}_data.csv",
711
+ mime="text/csv"
712
+ )
713
+ else:
714
+ st.warning("No data extracted")
715
+ else:
716
+ st.error("Failed to extract data from PDF")
717
+
718
+ with tab2:
719
+ st.subheader("Extracted Plot Images")
720
+
721
+ with st.spinner(" Extracting plots from PDF..."):
722
+ image_results = extract_images(pdf_path, paper_id=paper_id)
723
+
724
+ if image_results:
725
+ st.success(f" Extracted {len(image_results)} plots")
726
+
727
+ for r in image_results:
728
+ st.markdown(f"**Page {r['page']}** — {r['caption']}")
729
+ st.image(r["image"], use_container_width=True)
730
+ st.divider()
731
+ else:
732
+ st.warning("No plots found in PDF")
733
+
734
+
735
+ if __name__ == "__main__":
736
+ main()
src/pages/categorized/__pycache__/page1.cpython-312.pyc ADDED
Binary file (4.86 kB). View file
 
src/pages/categorized/__pycache__/page1.cpython-313.pyc ADDED
Binary file (4.94 kB). View file
 
src/pages/categorized/__pycache__/page1.cpython-314.pyc ADDED
Binary file (9.83 kB). View file
 
src/pages/categorized/__pycache__/page2.cpython-312.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page2.cpython-313.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page2.cpython-314.pyc ADDED
Binary file (672 Bytes). View file
 
src/pages/categorized/__pycache__/page3.cpython-313.pyc ADDED
Binary file (596 Bytes). View file
 
src/pages/categorized/__pycache__/page3.cpython-314.pyc ADDED
Binary file (2.93 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc ADDED
Binary file (34 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc.2029864538672 ADDED
Binary file (8.01 kB). View file
 
src/pages/categorized/__pycache__/page6.cpython-314.pyc.2097035857760 ADDED
Binary file (1.22 kB). View file
 
src/pages/categorized/page1.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from PIL import Image
4
+ import re
5
+
6
+ def extract_matrix_fiber_from_abbr(abbr: str):
7
+ if not isinstance(abbr, str):
8
+ return None, None
9
+
10
+ text = abbr.lower()
11
+
12
+ matrix_map = {
13
+ "epoxy": "Epoxy",
14
+ "cyanate ester": "Cyanate Ester",
15
+ "cynate ester": "Cyanate Ester",
16
+ "polypropylene": "Polypropylene",
17
+ "pp": "Polypropylene",
18
+ "peek": "PEEK",
19
+ "pei": "PEI",
20
+ "nylon": "Nylon",
21
+ "pa6": "PA6",
22
+ "polyester": "Polyester",
23
+ "vinyl ester": "Vinyl Ester",
24
+ "phenolic": "Phenolic"
25
+ }
26
+
27
+ matrix = None
28
+ for key, val in matrix_map.items():
29
+ if key in text:
30
+ matrix = val
31
+ break
32
+
33
+ fiber_map = {
34
+ "carbon": "Carbon Fiber",
35
+ "glass": "Glass Fiber",
36
+ "e-glass": "E-Glass Fiber",
37
+ "s-glass": "S-Glass Fiber",
38
+ "aramid": "Aramid Fiber",
39
+ "kevlar": "Kevlar Fiber",
40
+ "basalt": "Basalt Fiber",
41
+ "natural": "Natural Fiber"
42
+ }
43
+
44
+ fiber = None
45
+ for key, val in fiber_map.items():
46
+ if key in text:
47
+ fiber = val
48
+ break
49
+
50
+ return matrix, fiber
51
+
52
+
53
+ def main():
54
+ st.set_page_config(layout="wide")
55
+
56
+ mat_section = st.sidebar.expander("Materials", expanded=False)
57
+ with mat_section:
58
+ thermo = mat_section.button("Composites")
59
+ polymers = mat_section.button("Polymers")
60
+ Fibers = mat_section.button("Fibers")
61
+
62
+ if "material_type" not in st.session_state:
63
+ st.session_state.material_type = "Composites"
64
+
65
+ if thermo:
66
+ st.session_state.material_type = "Composites"
67
+ elif polymers:
68
+ st.session_state.material_type = "Polymers"
69
+ elif Fibers:
70
+ st.session_state.material_type = "Fibers"
71
+
72
+ @st.cache_data
73
+ def load_data(material_type):
74
+ file_map = {
75
+ "Composites": "data/Composites_material_data.csv",
76
+ "Polymers": "data/polymers_material_data.csv",
77
+ "Fibers": "data/Fibers_material_data.csv",
78
+ }
79
+ return pd.read_csv(file_map[material_type])
80
+
81
+ csv_data = load_data(st.session_state.material_type)
82
+
83
+ # if "user_uploaded_data" in st.session_state:
84
+ # df = pd.concat([csv_data, st.session_state["user_uploaded_data"]], ignore_index=True)
85
+ # else:
86
+ # df = csv_data
87
+ # Normalize naming between pages
88
+ CLASS_MAP = {
89
+ "Polymers": "Polymer",
90
+ "Fibers": "Fiber",
91
+ "Composites": "Composite",
92
+ }
93
+
94
+ current_class = CLASS_MAP[st.session_state.material_type]
95
+
96
+ if "user_uploaded_data" in st.session_state:
97
+ user_df = st.session_state["user_uploaded_data"]
98
+
99
+ filtered_user_df = user_df[
100
+ user_df["material_class"] == current_class
101
+ ]
102
+
103
+ df = pd.concat([csv_data, filtered_user_df], ignore_index=True)
104
+ else:
105
+ df = csv_data
106
+
107
+
108
+ st.session_state["base_data"] = df
109
+
110
+ st.title("Materials DataSet")
111
+
112
+ materials_df = (
113
+ df[["material_abbreviation", "material_name"]]
114
+ .fillna("")
115
+ .drop_duplicates()
116
+ .reset_index(drop=True)
117
+ )
118
+
119
+ materials_df[["Matrix", "Fiber"]] = materials_df["material_abbreviation"].apply(
120
+ lambda x: pd.Series(extract_matrix_fiber_from_abbr(x))
121
+ )
122
+
123
+
124
+ col1, col2 = st.columns(2, vertical_alignment="center")
125
+
126
+ # st.subheader("Filter Composites")
127
+
128
+ # matrix_options = sorted(
129
+ # materials_df["Matrix"].dropna().unique()
130
+ # )
131
+
132
+ # fiber_options = sorted(
133
+ # materials_df["Fiber"].dropna().unique()
134
+ # )
135
+
136
+ # fcol1, fcol2 = st.columns(2)
137
+
138
+ # with fcol1:
139
+ # selected_matrix = st.selectbox(
140
+ # "Matrix Material",
141
+ # ["All"] + matrix_options
142
+ # )
143
+
144
+ # with fcol2:
145
+ # selected_fiber = st.selectbox(
146
+ # "Fiber Material",
147
+ # ["All"] + fiber_options
148
+ # )
149
+
150
+
151
+ # filtered_materials_df = materials_df.copy()
152
+
153
+ # if selected_matrix != "All":
154
+ # filtered_materials_df = filtered_materials_df[
155
+ # filtered_materials_df["Matrix"] == selected_matrix
156
+ # ]
157
+
158
+ # if selected_fiber != "All":
159
+ # filtered_materials_df = filtered_materials_df[
160
+ # filtered_materials_df["Fiber"] == selected_fiber
161
+ # ]
162
+
163
+
164
+ with col1:
165
+ st.write("Filter Composites")
166
+
167
+ selected_matrix = "All"
168
+ selected_fiber = "All"
169
+
170
+ if st.session_state.material_type == "Composites":
171
+
172
+
173
+ matrix_options = sorted(
174
+ materials_df["Matrix"].dropna().unique()
175
+ )
176
+
177
+ fiber_options = sorted(
178
+ materials_df["Fiber"].dropna().unique()
179
+ )
180
+
181
+ fcol1, fcol2 = st.columns(2)
182
+
183
+ with fcol1:
184
+ selected_matrix = st.selectbox(
185
+ "Matrix Material",
186
+ ["All"] + matrix_options
187
+ )
188
+
189
+ with fcol2:
190
+ selected_fiber = st.selectbox(
191
+ "Fiber Material",
192
+ ["All"] + fiber_options
193
+ )
194
+
195
+
196
+
197
+ filtered_materials_df = materials_df.copy()
198
+
199
+ if st.session_state.material_type == "Composites":
200
+ if selected_matrix != "All":
201
+ filtered_materials_df = filtered_materials_df[
202
+ filtered_materials_df["Matrix"] == selected_matrix
203
+ ]
204
+
205
+ if selected_fiber != "All":
206
+ filtered_materials_df = filtered_materials_df[
207
+ filtered_materials_df["Fiber"] == selected_fiber
208
+ ]
209
+
210
+ st.write("Select Material")
211
+ st.dataframe(
212
+ filtered_materials_df,
213
+ key="material_table",
214
+ selection_mode="single-cell",
215
+ on_select="rerun",
216
+ use_container_width=True,
217
+ height=260
218
+ )
219
+
220
+ def get_selected_value(df, key, column_name):
221
+ if key in st.session_state:
222
+ sel = st.session_state[key]["selection"]["cells"]
223
+ if sel:
224
+ row_idx = sel[0][0]
225
+ return df.iloc[row_idx][column_name]
226
+ return None
227
+
228
+
229
+ mat = get_selected_value(materials_df, "material_table", "material_abbreviation")
230
+
231
+ with col2:
232
+ st.write("Select Property")
233
+
234
+ if mat:
235
+ filtered_df = df[
236
+ (df["material_abbreviation"] == mat) &
237
+ (df["value"].notna()) &
238
+ (df["property_name"].notna())
239
+ ]
240
+ property_sel = st.selectbox(
241
+ "Type of Property",
242
+ filtered_df["section"].drop_duplicates()
243
+ )
244
+
245
+ properties_df = (
246
+ filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
247
+ .drop_duplicates()
248
+ .reset_index(drop=True)
249
+ )
250
+ else:
251
+ filtered_df = df[df["value"].notna() & df["property_name"].notna()]
252
+ property_sel = st.selectbox(
253
+ "Type of Property",
254
+ filtered_df["section"].drop_duplicates()
255
+ )
256
+
257
+ properties_df = (
258
+ filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
259
+ .drop_duplicates()
260
+ .reset_index(drop=True)
261
+ )
262
+
263
+ st.dataframe(
264
+ properties_df,
265
+ key="property_table",
266
+ selection_mode="single-cell",
267
+ on_select="rerun",
268
+ use_container_width=True,
269
+ height=260
270
+ )
271
+
272
+ prop = get_selected_value(properties_df, "property_table", "property_name")
273
+
274
+ st.write("")
275
+ if st.button("Search", disabled=not (mat and prop)):
276
+ st.write(f"**Material:** {mat}")
277
+ st.write(f"**Property:** {prop}")
278
+
279
+ result = df[
280
+ (df["material_abbreviation"] == mat) &
281
+ (df["property_name"] == prop) &
282
+ (df["value"].notna())
283
+ ]
284
+
285
+ if not result.empty:
286
+ st.subheader("Property Data")
287
+ st.dataframe(result.T, use_container_width=True)
288
+
289
+ st.subheader("Property Graph")
290
+ img_path = f"images/{mat}_{prop}.png"
291
+
292
+ try:
293
+ img = Image.open(img_path)
294
+ st.image(img, use_container_width=True, caption="Stress strain curve")
295
+ except FileNotFoundError:
296
+ st.write("")
297
+ # fallback_img = Image.open("pages/categorized/ESS-min.jpg")
298
+ # st.image(fallback_img, use_container_width=True, caption="Stress strain curve")
299
+
300
+ else:
301
+ st.warning("No data found for this material-property combination")
302
+
303
+
304
+
305
+
306
+
307
+
src/pages/categorized/page2.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ from PIL import Image
5
+ import boto3
6
+ import tabula
7
+ import faiss
8
+ import json
9
+ import base64
10
+ import pymupdf
11
+ import requests
12
+ import os
13
+ import logging
14
+ import numpy as np
15
+ import warnings
16
+ from tqdm import tqdm
17
+ from botocore.exceptions import ClientError
18
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
19
+ from IPython import display
20
+ from langchain_aws import ChatBedrock
21
+
22
+
23
+ from pathlib import Path
24
+
25
+ def main():
26
+
27
+
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logger.setLevel(logging.ERROR)
32
+
33
+ warnings.filterwarnings("ignore")
34
+
35
+ def create_directories(base_dir):
36
+ directories = ["images", "text", "tables", "page_images"]
37
+ for dir in directories:
38
+ os.makedirs(os.path.join(base_dir, dir), exist_ok=True)
39
+
40
+
41
+ def process_tables(doc, page_num, base_dir, items):
42
+ try:
43
+ tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
44
+ if not tables:
45
+ return
46
+ for table_idx, table in enumerate(tables):
47
+ table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
48
+ table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
49
+ with open(table_file_name, 'w') as f:
50
+ f.write(table_text)
51
+ items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
52
+ except Exception as e:
53
+ print(f"Error extracting tables from page {page_num}: {str(e)}")
54
+
55
+ doc = pymupdf.open(filepath)
56
+ num_pages = len(doc)
57
+ base_dir = "data"
58
+
59
+ # Creating the directories
60
+ create_directories(base_dir)
61
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
62
+ items = []
63
+
64
+ # Process each page of the PDF
65
+ for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
66
+ page = doc[page_num]
67
+ process_tables(doc, page_num, base_dir, items)
68
+
69
+ [i for i in items if i['type'] == 'table'][0]
70
+ # Generating Multimodal Embeddings using Amazon Titan Multimodal Embeddings model
71
+ def generate_multimodal_embeddings(prompt=None, image=None, output_embedding_length=384):
72
+ """
73
+ Invoke the Amazon Titan Multimodal Embeddings model using Amazon Bedrock runtime.
74
+
75
+ Args:
76
+ prompt (str): The text prompt to provide to the model.
77
+ image (str): A base64-encoded image data.
78
+ Returns:
79
+ str: The model's response embedding.
80
+ """
81
+ if not prompt and not image:
82
+ raise ValueError("Please provide either a text prompt, base64 image, or both as input")
83
+
84
+ # Initialize the Amazon Bedrock runtime client
85
+ client = boto3.client(service_name="bedrock-runtime")
86
+ model_id = "amazon.titan-embed-image-v1"
87
+
88
+ body = {"embeddingConfig": {"outputEmbeddingLength": output_embedding_length}}
89
+
90
+ if prompt:
91
+ body["inputText"] = prompt
92
+ if image:
93
+ body["inputImage"] = image
94
+
95
+ try:
96
+ response = client.invoke_model(
97
+ modelId=model_id,
98
+ body=json.dumps(body),
99
+ accept="application/json",
100
+ contentType="application/json"
101
+ )
102
+
103
+ # Process and return the response
104
+ result = json.loads(response.get("body").read())
105
+ return result.get("embedding")
106
+
107
+ except ClientError as err:
108
+ print(f"Couldn't invoke Titan embedding model. Error: {err.response['Error']['Message']}")
109
+ return None
110
+
111
+ # Set embedding vector dimension
112
+ embedding_vector_dimension = 384
113
+
114
+ # Count the number of each type of item
115
+ item_counts = {
116
+ 'text': sum(1 for item in items if item['type'] == 'text'),
117
+ 'table': sum(1 for item in items if item['type'] == 'table'),
118
+ 'image': sum(1 for item in items if item['type'] == 'image'),
119
+ 'page': sum(1 for item in items if item['type'] == 'page')
120
+ }
121
+
122
+ # Initialize counters
123
+ counters = dict.fromkeys(item_counts.keys(), 0)
124
+
125
+ # Generate embeddings for all items
126
+ with tqdm(
127
+ total=len(items),
128
+ desc="Generating embeddings",
129
+ bar_format=(
130
+ "{l_bar}{bar}| {n_fmt}/{total_fmt} "
131
+ "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
132
+ )
133
+ ) as pbar:
134
+
135
+ for item in items:
136
+ item_type = item['type']
137
+ counters[item_type] += 1
138
+
139
+ if item_type in ['text', 'table']:
140
+ # For text or table, use the formatted text representation
141
+ item['embedding'] = generate_multimodal_embeddings(prompt=item['text'],output_embedding_length=embedding_vector_dimension)
142
+ else:
143
+ # For images, use the base64-encoded image data
144
+ item['embedding'] = generate_multimodal_embeddings(image=item['image'], output_embedding_length=embedding_vector_dimension)
145
+
146
+ # Update the progress bar
147
+ pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
148
+ pbar.update(1)
149
+
150
+ # All the embeddings
151
+ all_embeddings = np.array([item['embedding'] for item in items])
152
+
153
+ # Create FAISS Index
154
+ index = faiss.IndexFlatL2(embedding_vector_dimension)
155
+
156
+ # Clear any pre-existing index
157
+ index.reset()
158
+
159
+ # Add embeddings to the index
160
+ index.add(np.array(all_embeddings, dtype=np.float32))
161
+
162
+ # Generating RAG response with Amazon Nova
163
+ def invoke_nova_multimodal(prompt, matched_items):
164
+ """
165
+ Invoke the Amazon Nova model.
166
+ """
167
+
168
+
169
+ # Define your system prompt(s).
170
+ system_msg = [
171
+ { "text": """You are a helpful assistant for question answering.
172
+ The text context is relevant information retrieved.
173
+ The provided image(s) are relevant information retrieved."""}
174
+ ]
175
+
176
+ # Define one or more messages using the "user" and "assistant" roles.
177
+ message_content = []
178
+
179
+ for item in matched_items:
180
+ if item['type'] == 'text' or item['type'] == 'table':
181
+ message_content.append({"text": item['text']})
182
+ else:
183
+ message_content.append({"image": {
184
+ "format": "png",
185
+ "source": {"bytes": item['image']},
186
+ }
187
+ })
188
+
189
+
190
+ # Configure the inference parameters.
191
+ inf_params = {"max_new_tokens": 300,
192
+ "top_p": 0.9,
193
+ "top_k": 20}
194
+
195
+ # Define the final message list
196
+ message_list = [
197
+ {"role": "user", "content": message_content}
198
+ ]
199
+
200
+ # Adding the prompt to the message list
201
+ message_list.append({"role": "user", "content": [{"text": prompt}]})
202
+
203
+ native_request = {
204
+ "messages": message_list,
205
+ "system": system_msg,
206
+ "inferenceConfig": inf_params,
207
+ }
208
+
209
+ # Initialize the Amazon Bedrock runtime client
210
+ model_id = "amazon.nova-pro-v1:0"
211
+ client = ChatBedrock(model_id=model_id)
212
+
213
+ # Invoke the model and extract the response body.
214
+ response = client.invoke(json.dumps(native_request))
215
+ model_response = response.content
216
+
217
+ return model_response
218
+
219
+
220
+ # User Query
221
+ query = "Which optimizer was used when training the models?"
222
+
223
+ # Generate embeddings for the query
224
+ query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
225
+
226
+ # Search for the nearest neighbors in the vector database
227
+ distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
228
+
229
+ # Check the result (matched chunks)
230
+ result.flatten()
231
+
232
+ # Retrieve the matched items
233
+ matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
234
+
235
+ # Generate RAG response with Amazon Nova
236
+ response = invoke_nova_multimodal(query, matched_items)
237
+
238
+ display.Markdown(response)
239
+
240
+ # List of queries (Replace with any query of your choice)
241
+ other_queries = ["How long were the base and big models trained?",
242
+ "Which optimizer was used when training the models?",
243
+ "What is the position-wise feed-forward neural network mentioned in the paper?",
244
+ "What is the BLEU score of the model in English to German translation (EN-DE)?",
245
+ "How is the scaled-dot-product attention is calculated?",
246
+ ]
247
+
248
+ query = other_queries[0] # Replace with any query from the list above
249
+
250
+ # Generate embeddings for the query
251
+ query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
252
+
253
+ # Search for the nearest neighbors in the vector database
254
+ distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
255
+
256
+ # Retrieve the matched items
257
+ matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
258
+
259
+ # Generate RAG response with Amazon Nova
260
+ response = invoke_nova_multimodal(query, matched_items)
261
+
262
+ # Display the response
263
+ display.Markdown(response)
264
+
265
+
src/pages/categorized/page3.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import tabula
4
+ import pymupdf
5
+ import os
6
+ from tqdm import tqdm
7
+
8
+
9
+ def extract_tables_pymupdf(pdf_path):
10
+ """Extract tables using PyMuPDF (alternative method)"""
11
+ try:
12
+ doc = pymupdf.open(pdf_path)
13
+ all_tables = []
14
+
15
+ for page_num in range(len(doc)):
16
+ page = doc[page_num]
17
+ tables = page.find_tables()
18
+
19
+ for table in tables:
20
+ # Extract table data
21
+ table_data = table.extract()
22
+ if table_data:
23
+ # Convert to DataFrame
24
+ df = pd.DataFrame(table_data[1:], columns=table_data[0])
25
+ all_tables.append({
26
+ 'page': page_num + 1,
27
+ 'dataframe': df
28
+ })
29
+
30
+ doc.close()
31
+ return all_tables
32
+ except Exception as e:
33
+ st.error(f"Error extracting tables with PyMuPDF: {e}")
34
+ return []
35
+
36
+ def main():
37
+ st.title("PDF Table Extractor")
38
+ st.write("Upload a PDF to extract all tables")
39
+
40
+ temp_path = "temp_uploaded.pdf" # Define here
41
+
42
+ uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
43
+
44
+ if uploaded_file is not None:
45
+ # Save uploaded file temporarily
46
+ with open(temp_path, "wb") as f:
47
+ f.write(uploaded_file.getbuffer())
48
+
49
+ # Using PyMuPDF
50
+ tables = extract_tables_pymupdf(temp_path)
51
+
52
+ if tables:
53
+ st.success(f"Found {len(tables)} tables!")
54
+
55
+ for idx, table_info in enumerate(tables):
56
+ st.subheader(f"Table {idx + 1} (Page {table_info['page']})")
57
+ df = table_info['dataframe']
58
+ st.dataframe(df, use_container_width=True)
59
+
60
+ # Clean up temp file
61
+ if os.path.exists(temp_path):
62
+ os.remove(temp_path)
src/pages/categorized/page4.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
def main():
    """Render a placeholder header naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
src/pages/categorized/page5.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
def main():
    """Render a placeholder header naming this page's folder and file."""
    current = Path(__file__)
    st.write(f'# {current.parent.name} - {current.name}')
src/pages/categorized/page6.py ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import tempfile
5
+ import zipfile
6
+ from io import BytesIO
7
+ import fitz # PyMuPDF
8
+ import cv2
9
+ import numpy as np
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import requests
14
+ import base64
15
+ from typing import Dict, Any, Optional
16
+ from collections import defaultdict
17
+
18
# Gemini API configuration.
# SECURITY FIX: the API key was previously hard-coded here, which leaks the
# credential in version control (the old key should be revoked). It is now
# read from the environment; set GEMINI_API_KEY before running the app.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"

# Structured-output schema forcing Gemini to return the material's name and
# abbreviation plus one flat list of properties ("mechanical_properties" is
# used as the single bucket for ALL property categories).
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
+
45
def make_abbreviation(name: str) -> str:
    """Derive a short uppercase abbreviation from a material name.

    Uses the initial letter of each word; falls back to the first six
    characters (uppercased) when no word starts with a letter, and to
    "UNKNOWN" for empty/None input.
    """
    if not name:
        return "UNKNOWN"
    initials = [word[0] for word in name.split() if word and word[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
+
53
# Rendering resolution (dots per inch) used when rasterizing PDF pages.
DPI = 300
# Matches figure-caption leaders such as "Fig. 3" / "Figure 12" at line start.
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
+
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    The PDF is base64-embedded in the request together with an extraction
    prompt; the response is constrained by SCHEMA (structured output).
    Returns the decoded JSON dict, or None on any failure (an error is
    shown in the Streamlit UI).

    NOTE(review): `filename` is currently unused — presumably intended for
    logging/labels; confirm before removing.
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    # Extraction instructions; field names here must match SCHEMA.
    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    # generateContent payload: prompt text + inline PDF data, with
    # deterministic decoding (temperature 0) and schema-constrained JSON.
    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # Take the first part whose text looks like a JSON object.
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        st.error(f"Gemini API Error: {e}")
        return None
111
+
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the Gemini extraction JSON into one row per property.

    Guarantees a non-empty material_abbreviation by deriving one from the
    material name when the model left it blank, and substitutes sensible
    fallbacks for blank/missing per-property fields.
    """
    mat_name = data.get("material_name", "") or ""
    mat_abbr = data.get("material_abbreviation", "") or ""
    if not mat_abbr:
        mat_abbr = make_abbreviation(mat_name)

    def _row(prop):
        # Blank or missing fields get the same defaults the UI expects.
        return {
            "material_name": mat_name,
            "material_abbreviation": mat_abbr,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }

    return pd.DataFrame([_row(p) for p in data.get("mechanical_properties", [])])
134
+
135
+ # --- IMAGE EXTRACTION LOGIC ---
136
def get_page_image(page):
    """Rasterize a PDF page at DPI and return it as a BGR OpenCV image."""
    scale = DPI / 72  # PDF coordinates are in points (1/72 inch)
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
    rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, 3)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
140
+
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a plot.

    Rejects tiny crops and mostly-ink regions (likely dense text or
    photos), then requires at least one long horizontal or vertical run
    of ink — i.e. something resembling a plot axis or frame line.
    """
    height, width = binary_crop.shape

    # Too small to be a meaningful figure.
    if height < 100 or width < 100:
        return False

    # Very dense ink coverage suggests text/photo, not a line plot.
    if cv2.countNonZero(binary_crop) / (width * height) > 0.35:
        return False

    def _has_long_run(kernel_shape):
        # Erosion leaves pixels only where an unbroken ink run at least
        # as long as the kernel exists.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_shape)
        return cv2.countNonZero(cv2.erode(binary_crop, kernel, iterations=1)) > 0

    return _has_long_run((width // 4, 1)) or _has_long_run((1, height // 4))
153
+
154
def merge_boxes(rects):
    """Drop any box that lies (within a 15px margin) inside a kept box.

    Boxes are (x, y, w, h) tuples. Processing largest-area first ensures a
    contained box is always compared against its potential container.
    """
    if not rects:
        return []

    kept = []
    for box in sorted(rects, key=lambda r: r[2] * r[3], reverse=True):
        x, y, w, h = box
        contained = False
        for kx, ky, kw, kh in kept:
            # Fully inside a kept box, allowing 15 pixels of slack per edge.
            if (x >= kx - 15 and y >= ky - 15
                    and x + w <= kx + kw + 15 and y + h <= ky + kh + 15):
                contained = True
                break
        if not contained:
            kept.append(box)
    return kept
164
+
165
def extract_images(pdf_doc):
    """Detect plot-like regions on every page and crop them out.

    Pipeline per page: rasterize -> binarize -> dilate -> find contours ->
    filter by size/geometry heuristics -> dedupe nested boxes -> pair each
    crop with the nearest "Fig./Figure N" caption below it. Crops are kept
    in memory (PNG bytes plus the raw BGR array) and grouped by caption.

    Returns a list of dicts: {"caption", "page", "image_data": [...]}.
    """
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # extra pixels around each detected box when cropping

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Near-white -> 0, ink -> 255 (inverted binary threshold).
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilate so nearby ink merges into single candidate regions.
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions between 3% and 80% of the page area.
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            # Find the nearest caption BELOW the box (within 30% of page height).
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    # Text blocks are in PDF points; scale to raster pixels.
                    cap_y = b[1] * (DPI/72)
                    dist = cap_y - (cy + ch)
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Crop with padding, clamped to the page bounds.
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,
                "array": crop
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
+
221
def create_zip(results, include_json=True):
    """Bundle all extracted plot images into an in-memory zip archive.

    When include_json is True, a "plot_data.json" summary (caption, page,
    image count per figure group) is added alongside the PNGs. Returns the
    archive contents as bytes.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            # Summarize each figure group for the accompanying metadata file.
            summary = []
            for entry in results:
                summary.append({"caption": entry["caption"], "page": entry["page"],
                                "image_count": len(entry["image_data"])})
            zf.writestr("plot_data.json", json.dumps(summary, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
+
237
def input_form():
    """Render the manual property-entry form.

    Three cascading selectboxes (material class -> property category ->
    property name) gate a form for a single property row. A submitted,
    validated row is appended to st.session_state["user_uploaded_data"].
    """
    # Property categories offered per material class.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category) pair; keys
    # must mirror PROPERTY_CATEGORIES.
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Second selectbox appears only once a class is chosen.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    # Third selectbox appears only once both previous choices are made.
    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    # The data-entry form is shown only after all three selections.
    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

            if submitted:
                # Minimal validation: name and value are mandatory.
                if not (material_name and value):
                    st.error("Material name and value are required.")

                else:
                    # Single-row DataFrame so it can be concatenated onto
                    # the accumulated session-state table.
                    Input_db = pd.DataFrame([{
                        "material_class": material_class,
                        "material_name": material_name,
                        "material_abbreviation": material_abbr,
                        "section": property_category,
                        "property_name": property_name,
                        "value": value,
                        "unit": unit,
                        "english_units": english,
                        "test_condition": test_condition,
                        "comments": comments
                    }])

                    st.success("Property added successfully")
                    st.dataframe(Input_db)

                    # First submission seeds the table; later ones append.
                    if "user_uploaded_data" not in st.session_state:
                        st.session_state["user_uploaded_data"] = Input_db
                        return
                    else:
                        st.session_state["user_uploaded_data"] = pd.concat(
                            [st.session_state["user_uploaded_data"], Input_db],
                            ignore_index=True
                        )

    return
427
+
428
def main():
    """Streamlit entry point: manual data entry plus PDF data/plot extraction.

    Session-state keys:
      image_results      -- extracted plot groups for the current PDF
      pdf_processed      -- True once plots were extracted for the current PDF
      current_pdf_name   -- name of the last uploaded PDF (change detection)
      form_submitted     -- True right after the manual form added a row
      pdf_data_extracted -- True once Gemini extraction ran for the current PDF
      pdf_extracted_df   -- DataFrame of properties extracted from the PDF
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # Seed session state with defaults on first run.
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect whether the manual input form just added a row by comparing the
    # accumulated table's length before and after rendering the form.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # Nothing uploaded: reset all per-PDF state and stop.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # File-safe identifier derived from the upload's name.
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    if st.session_state.current_pdf_name != uploaded_file.name:
        # A different PDF was uploaded: invalidate ALL cached results.
        # BUGFIX: pdf_data_extracted / pdf_extracted_df were previously not
        # reset here, so the old PDF's extracted table kept showing for the
        # new file.
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()

    if st.session_state.form_submitted:
        # A manual form row was just added during this rerun; skip the
        # (already-done) PDF extraction work and show status only.
        st.session_state.form_submitted = False
        # BUGFIX: the original string concatenation was missing a separator
        # and rendered as "data/plotsupload again".
        st.info("A Form was submitted. But your previous extracted data has been added already. "
                "If you want to extract more data/plots, upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    with tempfile.TemporaryDirectory() as tmpdir:
        # Persist the upload so PyMuPDF / the API reader can open it by path.
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF; results are cached in session state.
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data  # keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")

            # After extraction, or when rerunning, use the stored data.
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # Also set material_type for Page 1 filtering.
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            # Plot extraction also runs only once per PDF.
            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Iterate over a snapshot so in-loop deletions (which
                    # shrink the live list) don't raise IndexError.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        # BUGFIX: previously passed width=img_width,
                                        # but img_width was never defined (NameError).
                                        st.image(img_data['array'], use_container_width=True, channels="BGR")
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")

if __name__ == "__main__":
    main()
src/pages/categorized/propgraph.jpg ADDED