gvlktejaswi commited on
Commit
b443b15
·
verified ·
1 Parent(s): 778dfb3

Delete src/pages

Browse files
src/pages/3_Categorized_Search.py DELETED
@@ -1,34 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image # Used to open and handle image files
3
-
4
-
5
-
6
- def load_page1():
7
- from pages.categorized.page1 import main
8
- main()
9
-
10
- # def load_page2():
11
- # from pages.categorized.page2 import main
12
- # main()
13
-
14
-
15
-
16
- load_page1()
17
-
18
-
19
- #st.sidebar.button('Material Type', on_click=load_page1)
20
- #st.sidebar.button('Trade Name', on_click=load_page2)
21
- #st.sidebar.button('Manufacturer Name', on_click=load_page3)
22
-
23
- #image = Image.open('logo.png')
24
- #st.image(image, caption='a', use_container_width=True)
25
- st.sidebar.write("")
26
- st.sidebar.write("")
27
- st.sidebar.write("")
28
- st.sidebar.write("")
29
- st.sidebar.write("")
30
- st.sidebar.write("")
31
- st.sidebar.write("")
32
- st.sidebar.write("")
33
- st.sidebar.image("logo.png", caption=" ", width=150)
34
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/5_Upload_Data.py DELETED
@@ -1,18 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image
3
-
4
-
5
- # def load_page1():
6
- # from pages.categorized.page1 import main
7
- # main()
8
-
9
- def load_page6():
10
- from pages.categorized.page6 import main
11
- main()
12
-
13
- def load_page3():
14
- from pages.categorized.page3 import main
15
- main()
16
-
17
- load_page6()
18
- #load_page3()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/5_Upload_Data.py DELETED
@@ -1,18 +0,0 @@
1
- import streamlit as st
2
- from PIL import Image
3
-
4
-
5
- # def load_page1():
6
- # from pages.categorized.page1 import main
7
- # main()
8
-
9
- def load_page6():
10
- from pages.pages.categorized.page6 import main
11
- main()
12
-
13
- def load_page3():
14
- from pages.pages.categorized.page3 import main
15
- main()
16
-
17
- load_page6()
18
- #load_page3()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/Backend/Pdf_DataExtraction.py DELETED
@@ -1,120 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from PIL import Image
4
- import requests
5
- import base64
6
- import json
7
- import os
8
- from typing import Dict, Any, Optional
9
-
10
-
11
-
12
-
13
- # Backend PDF extraction Logic
14
- API_KEY = "[REDACTED-GOOGLE-API-KEY]"  # SECURITY(review): a live Google API key was committed here — revoke/rotate it and load it from an environment variable instead
15
- API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
16
-
17
- SCHEMA = {
18
- "type": "OBJECT",
19
- "properties": {
20
- "material_name": {"type": "STRING"},
21
- "material_abbreviation": {"type": "STRING"},
22
- "mechanical_properties": {
23
- "type": "ARRAY",
24
- "items": {
25
- "type": "OBJECT",
26
- "properties": {
27
- "section": {"type": "STRING"},
28
- "property_name": {"type": "STRING"},
29
- "value": {"type": "STRING"},
30
- "unit": {"type": "STRING"},
31
- "english": {"type": "STRING"},
32
- "test_condition": {"type": "STRING"},
33
- "comments": {"type": "STRING"}
34
- },
35
- "required": ["section", "property_name", "value", "english", "comments"]
36
- }
37
- }
38
- }
39
- }
40
-
41
- # === GEMINI CALL FUNCTION ===
42
- def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
43
- """Calls Gemini API with PDF bytes"""
44
- try:
45
- encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
46
- mime_type = "application/pdf"
47
- except Exception as e:
48
- st.error(f"Error encoding PDF: {e}")
49
- return None
50
-
51
- prompt = (
52
- "Extract all experimental data from this research paper. "
53
- "For each measurement, extract: "
54
- "- experiment_name, measured_value, unit, uncertainty, method, conditions. "
55
- "Return as JSON."
56
- # "You are an expert materials scientist. From the attached PDF, extract the material name, "
57
- # "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
58
- # "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
59
- # "For each property, you MUST extract:\n"
60
- # "- property_name\n- value (or range)\n- unit\n"
61
- # "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
62
- # "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
63
- # "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
64
- )
65
-
66
- payload = {
67
- "contents": [
68
- {
69
- "parts": [
70
- {"text": prompt},
71
- {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
72
- ]
73
- }
74
- ],
75
- "generationConfig": {
76
- "temperature": 0,
77
- "responseMimeType": "application/json",
78
- "responseSchema": SCHEMA
79
- }
80
- }
81
-
82
- try:
83
- r = requests.post(API_URL, json=payload, timeout=300)
84
- r.raise_for_status()
85
- data = r.json()
86
-
87
- candidates = data.get("candidates", [])
88
- if not candidates:
89
- return None
90
-
91
- parts = candidates[0].get("content", {}).get("parts", [])
92
- json_text = None
93
- for p in parts:
94
- t = p.get("text", "")
95
- if t.strip().startswith("{"):
96
- json_text = t
97
- break
98
-
99
- return json.loads(json_text) if json_text else None
100
- except Exception as e:
101
- st.error(f"Gemini API Error: {e}")
102
- return None
103
-
104
-
105
- def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
106
- """Convert extracted JSON to DataFrame"""
107
- rows = []
108
- for item in data.get("mechanical_properties", []):
109
- rows.append({
110
- "material_name": data.get("material_name", ""),
111
- "material_abbreviation": data.get("material_abbreviation", ""),
112
- "section": item.get("section", ""),
113
- "property_name": item.get("property_name", ""),
114
- "value": item.get("value", ""),
115
- "unit": item.get("unit", ""),
116
- "english": item.get("english", ""),
117
- "test_condition": item.get("test_condition", ""),
118
- "comments": item.get("comments", "")
119
- })
120
- return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/Backend/Pdf_ImageExtraction.py DELETED
@@ -1,390 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import math
5
- import tempfile
6
- import fitz # PyMuPDF
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import streamlit as st
11
-
12
- # -------------------
13
- # Config
14
- # -------------------
15
- DPI = 300
16
- OUT_DIR = "outputs"
17
-
18
- KEEP_ONLY_STRESS_STRAIN = False
19
-
20
- CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
21
- SS_KW = re.compile(
22
- r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
23
- re.IGNORECASE
24
- )
25
-
26
- # -------------------
27
- # Render helpers
28
- # -------------------
29
- def render_page(page, dpi=DPI):
30
- mat = fitz.Matrix(dpi/72, dpi/72)
31
- pix = page.get_pixmap(matrix=mat, alpha=False)
32
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
33
- return img, mat
34
-
35
- def pdf_to_px_bbox(bbox_pdf, mat):
36
- x0, y0, x1, y1 = bbox_pdf
37
- sx, sy = mat.a, mat.d
38
- return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
39
-
40
- def safe_crop_px(pil_img, box):
41
- if not isinstance(box, (tuple, list)):
42
- return None
43
- if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
44
- box = box[0]
45
- if len(box) != 4:
46
- return None
47
-
48
- x0, y0, x1, y1 = box
49
- if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
50
- return None
51
-
52
- try:
53
- x0 = int(x0)
54
- y0 = int(y0)
55
- x1 = int(x1)
56
- y1 = int(y1)
57
- except (TypeError, ValueError):
58
- return None
59
-
60
- if x1 < x0:
61
- x0, x1 = x1, x0
62
- if y1 < y0:
63
- y0, y1 = y1, y0
64
-
65
- W, H = pil_img.size
66
- x0 = max(0, min(W, x0))
67
- x1 = max(0, min(W, x1))
68
- y0 = max(0, min(H, y0))
69
- y1 = max(0, min(H, y1))
70
- if x1 <= x0 or y1 <= y0:
71
- return None
72
- return pil_img.crop((x0, y0, x1, y1))
73
-
74
- # -------------------
75
- # Captions
76
- # -------------------
77
- def find_caption_blocks(page):
78
- caps = []
79
- blocks = page.get_text("blocks")
80
- for b in blocks:
81
- x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
82
- t = " ".join(str(text).strip().split())
83
- if CAP_RE.match(t):
84
- caps.append({"bbox": (x0, y0, x1, y1), "text": t})
85
- return caps
86
-
87
- # -------------------
88
- # Dedupe: dHash
89
- # -------------------
90
- def dhash64(pil_img):
91
- gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
92
- pixels = list(gray.getdata())
93
- bits = 0
94
- for r in range(8):
95
- for c in range(8):
96
- left = pixels[r * 9 + c]
97
- right = pixels[r * 9 + c + 1]
98
- bits = (bits << 1) | (1 if left > right else 0)
99
- return bits
100
-
101
- # -------------------
102
- # Rejectors
103
- # -------------------
104
- def has_colorbar_like_strip(pil_img):
105
- img = np.array(pil_img)
106
- if img.ndim != 3:
107
- return False
108
- H, W, _ = img.shape
109
- if W < 250 or H < 150:
110
- return False
111
- strip_w = max(18, int(0.07 * W))
112
- strip = img[:, W-strip_w:W, :]
113
- q = (strip // 24).reshape(-1, 3)
114
- uniq = np.unique(q, axis=0)
115
- return len(uniq) > 70
116
-
117
- def texture_score(pil_img):
118
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
119
- lap = cv2.Laplacian(gray, cv2.CV_64F)
120
- return float(lap.var())
121
-
122
- def is_mostly_legend(pil_img):
123
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
124
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
125
- bw = cv2.medianBlur(bw, 3)
126
- H, W = bw.shape
127
- fill = float(np.count_nonzero(bw)) / float(H * W)
128
- return (0.03 < fill < 0.18) and (min(H, W) < 260)
129
-
130
- # -------------------
131
- # Plot detection
132
- # -------------------
133
- def detect_axes_lines(pil_img):
134
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
135
- edges = cv2.Canny(gray, 50, 150)
136
- H, W = gray.shape
137
- min_len = int(0.28 * min(H, W))
138
-
139
- lines = cv2.HoughLinesP(
140
- edges, 1, np.pi/180,
141
- threshold=90,
142
- minLineLength=min_len,
143
- maxLineGap=14
144
- )
145
- if lines is None:
146
- return None, None
147
-
148
- horizontals, verticals = [], []
149
- for x1, y1, x2, y2 in lines[:, 0]:
150
- dx, dy = abs(x2-x1), abs(y2-y1)
151
- length = math.hypot(dx, dy)
152
- if dy < 18 and dx > 0.35 * W:
153
- horizontals.append((length, (x1, y1, x2, y2)))
154
- if dx < 18 and dy > 0.35 * H:
155
- verticals.append((length, (x1, y1, x2, y2)))
156
-
157
- if not horizontals or not verticals:
158
- return None, None
159
-
160
- horizontals.sort(key=lambda t: t[0], reverse=True)
161
- verticals.sort(key=lambda t: t[0], reverse=True)
162
- return horizontals[0][1], verticals[0][1]
163
-
164
- def axis_intersection_ok(x_axis, y_axis, W, H):
165
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
166
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
167
- if not (0 <= xa_y < H and 0 <= ya_x < W):
168
- return False
169
- if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
170
- return False
171
- return True
172
-
173
- def tick_text_presence_score(pil_img, x_axis, y_axis):
174
- img = np.array(pil_img)
175
- gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
176
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
177
- bw = cv2.medianBlur(bw, 3)
178
-
179
- H, W = gray.shape
180
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
181
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
182
-
183
- y0a = max(0, xa_y - 40)
184
- y1a = min(H, xa_y + 110)
185
- x_roi = bw[y0a:y1a, 0:W]
186
-
187
- x0b = max(0, ya_x - 180)
188
- x1b = min(W, ya_x + 50)
189
- y_roi = bw[0:H, x0b:x1b]
190
-
191
- def count_small_components(mask):
192
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
193
- cnt = 0
194
- for i in range(1, num):
195
- x, y, w, h, area = stats[i]
196
- if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
197
- cnt += 1
198
- return cnt
199
-
200
- return count_small_components(x_roi) + count_small_components(y_roi)
201
-
202
- def is_real_plot(pil_img):
203
- if has_colorbar_like_strip(pil_img):
204
- return False
205
- if is_mostly_legend(pil_img):
206
- return False
207
-
208
- x_axis, y_axis = detect_axes_lines(pil_img)
209
- if x_axis is None or y_axis is None:
210
- return False
211
-
212
- arr = np.array(pil_img)
213
- H, W = arr.shape[0], arr.shape[1]
214
- if not axis_intersection_ok(x_axis, y_axis, W, H):
215
- return False
216
-
217
- if texture_score(pil_img) > 2200:
218
- return False
219
-
220
- score = tick_text_presence_score(pil_img, x_axis, y_axis)
221
- return score >= 18
222
-
223
- # -------------------
224
- # Candidate boxes in a region
225
- # -------------------
226
- def connected_components_boxes(pil_img):
227
- img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
228
- gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
229
- mask = (gray < 245).astype(np.uint8) * 255
230
- mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
231
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
232
-
233
- boxes = []
234
- for i in range(1, num):
235
- x, y, w, h, area = stats[i]
236
- boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
237
- boxes.sort(key=lambda t: t[0], reverse=True)
238
- return boxes
239
-
240
- def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
241
- x0, y0, x1, y1 = box
242
- bw = x1 - x0
243
- bh = y1 - y0
244
- ex0 = max(0, int(x0 - left * bw))
245
- ex1 = min(W, int(x1 + right * bw))
246
- ey0 = max(0, int(y0 - top * bh))
247
- ey1 = min(H, int(y1 + bottom * bh))
248
- return (ex0, ey0, ex1, ey1)
249
-
250
- # -------------------
251
- # Crop plot from caption
252
- # -------------------
253
- def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
254
- cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
255
- cap_y0 = cap_px[1]
256
- cap_y1 = cap_px[3]
257
-
258
- W, H = page_img.size
259
- search_top = max(0, cap_y0 - int(0.95 * H))
260
- search_bot = min(H, cap_y1 + int(0.20 * H))
261
- region = safe_crop_px(page_img, (0, search_top, W, search_bot))
262
- if region is None:
263
- return None
264
-
265
- comps = connected_components_boxes(region)
266
- best = None
267
- best_area = -1
268
-
269
- for area, box in comps[:35]:
270
- x0, y0, x1, y1 = box
271
- bw = x1 - x0
272
- bh = y1 - y0
273
- if bw < 220 or bh < 180:
274
- continue
275
-
276
- exp = expand_box(box, region.size[0], region.size[1])
277
- cand = safe_crop_px(region, exp)
278
- if cand is None:
279
- continue
280
-
281
- if not is_real_plot(cand):
282
- continue
283
-
284
- if area > best_area:
285
- best_area = area
286
- best = cand
287
-
288
- return best
289
-
290
- # -------------------
291
- # Streamlit UI
292
- # -------------------
293
- def run_extraction(pdf_path, paper_id="uploaded_paper"):
294
- out_paper = os.path.join(OUT_DIR, paper_id)
295
- out_imgs = os.path.join(out_paper, "plots_with_axes")
296
- os.makedirs(out_imgs, exist_ok=True)
297
-
298
- doc = fitz.open(pdf_path)
299
- results = []
300
- seen = set()
301
- saved = 0
302
-
303
- for p in range(len(doc)):
304
- page = doc[p]
305
- caps = find_caption_blocks(page)
306
- if not caps:
307
- continue
308
-
309
- page_img, mat = render_page(page, dpi=DPI)
310
-
311
- for cap in caps:
312
- cap_text = cap["text"]
313
-
314
- if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
315
- continue
316
-
317
- fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
318
- if fig is None:
319
- continue
320
-
321
- if fig.size[0] > 8 and fig.size[1] > 8:
322
- fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
323
-
324
- try:
325
- h = dhash64(fig)
326
- except Exception:
327
- continue
328
-
329
- if h in seen:
330
- continue
331
- seen.add(h)
332
-
333
- img_name = f"p{p+1:02d}_{saved:04d}.png"
334
- img_path = os.path.join(out_imgs, img_name)
335
- fig.save(img_path)
336
-
337
- results.append({
338
- "page": p + 1,
339
- "caption": cap_text,
340
- "image": img_path
341
- })
342
- saved += 1
343
-
344
- out_json = os.path.join(out_paper, "plots_with_axes.json")
345
- with open(out_json, "w", encoding="utf-8") as f:
346
- json.dump(results, f, indent=2, ensure_ascii=False)
347
-
348
- return results, out_json
349
-
350
- def main():
351
- st.set_page_config(page_title="Research Paper Plot Extractor", layout="wide")
352
- st.title(" Plot Extractor (Upload PDF)")
353
-
354
- uploaded = st.file_uploader("Upload a research paper PDF", type=["pdf"])
355
- if not uploaded:
356
- st.info("Upload a PDF to extract plots.")
357
- return
358
-
359
- paper_id = os.path.splitext(uploaded.name)[0].replace(" ", "_")
360
-
361
- with tempfile.TemporaryDirectory() as tmpdir:
362
- pdf_path = os.path.join(tmpdir, uploaded.name)
363
- with open(pdf_path, "wb") as f:
364
- f.write(uploaded.read())
365
-
366
- with st.spinner("Extracting plots..."):
367
- results, out_json = run_extraction(pdf_path, paper_id=paper_id)
368
-
369
- st.success(f"Extracted {len(results)} plots.")
370
-
371
- # Show images + captions
372
- for r in results:
373
- st.markdown(f"**Page {r['page']}** — {r['caption']}")
374
- st.image(r["image"], use_container_width=True)
375
- st.divider()
376
-
377
- # JSON viewer + download
378
- st.subheader("JSON Output")
379
- st.json(results)
380
-
381
- with open(out_json, "rb") as f:
382
- st.download_button(
383
- "Download JSON",
384
- data=f,
385
- file_name=os.path.basename(out_json),
386
- mime="application/json"
387
- )
388
-
389
- if __name__ == "__main__":
390
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/ESS-min.jpg DELETED

Git LFS Details

  • SHA256: ff58c9304c39dc90ca15b516a1f1ec385ea60a9829c5dd9eb698ee1f82778eb7
  • Pointer size: 131 Bytes
  • Size of remote file: 356 kB
src/pages/pages/categorized/Temp_Backup.py DELETED
@@ -1,736 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import math
5
- import tempfile
6
- import fitz # PyMuPDF
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import streamlit as st
11
- import pandas as pd
12
- import requests
13
- import base64
14
- from typing import Dict, Any, Optional
15
-
16
- API_KEY = "[REDACTED-GOOGLE-API-KEY]"  # SECURITY(review): a live Google API key was committed here — revoke/rotate it and load it from an environment variable instead
17
- API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
18
-
19
- SCHEMA = {
20
- "type": "OBJECT",
21
- "properties": {
22
- "material_name": {"type": "STRING"},
23
- "material_abbreviation": {"type": "STRING"},
24
- "mechanical_properties": {
25
- "type": "ARRAY",
26
- "items": {
27
- "type": "OBJECT",
28
- "properties": {
29
- "section": {"type": "STRING"},
30
- "property_name": {"type": "STRING"},
31
- "value": {"type": "STRING"},
32
- "unit": {"type": "STRING"},
33
- "english": {"type": "STRING"},
34
- "test_condition": {"type": "STRING"},
35
- "comments": {"type": "STRING"}
36
- },
37
- "required": ["section", "property_name", "value", "english", "comments"]
38
- }
39
- }
40
- }
41
- }
42
- def make_abbreviation(name: str) -> str:
43
- """Create a simple abbreviation from the material name."""
44
- if not name:
45
- return "UNKNOWN"
46
- words = name.split()
47
- abbr = "".join(w[0] for w in words if w and w[0].isalpha()).upper()
48
- return abbr or name[:6].upper()
49
-
50
- DPI = 300
51
- OUT_DIR = "outputs"
52
- KEEP_ONLY_STRESS_STRAIN = False
53
- CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
54
- SS_KW = re.compile(
55
- r"(stress\s*[-–]?\s*strain|stress|strain|tensile|MPa|GPa|kN|yield|elongation)",
56
- re.IGNORECASE
57
- )
58
-
59
- def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
60
- """Calls Gemini API with PDF bytes"""
61
- try:
62
- encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
63
- mime_type = "application/pdf"
64
- except Exception as e:
65
- st.error(f"Error encoding PDF: {e}")
66
- return None
67
-
68
- prompt = (
69
- "You are an expert materials scientist. From the attached PDF, extract the material name, "
70
- "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
71
- "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
72
- "For each property, you MUST extract:\n"
73
- "- property_name\n- value (or range)\n- unit\n"
74
- "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
75
- "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
76
- "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
77
- )
78
-
79
- payload = {
80
- "contents": [{
81
- "parts": [
82
- {"text": prompt},
83
- {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
84
- ]
85
- }],
86
- "generationConfig": {
87
- "temperature": 0,
88
- "responseMimeType": "application/json",
89
- "responseSchema": SCHEMA
90
- }
91
- }
92
-
93
- try:
94
- r = requests.post(API_URL, json=payload, timeout=300)
95
- r.raise_for_status()
96
- data = r.json()
97
-
98
- candidates = data.get("candidates", [])
99
- if not candidates:
100
- return None
101
-
102
- parts = candidates[0].get("content", {}).get("parts", [])
103
- json_text = None
104
- for p in parts:
105
- t = p.get("text", "")
106
- if t.strip().startswith("{"):
107
- json_text = t
108
- break
109
-
110
- return json.loads(json_text) if json_text else None
111
- except Exception as e:
112
- st.error(f"Gemini API Error: {e}")
113
- return None
114
-
115
- # def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
116
- # """Convert extracted JSON to DataFrame"""
117
- # rows = []
118
- # for item in data.get("mechanical_properties", []):
119
- # rows.append({
120
- # "material_name": data.get("material_name", ""),
121
- # "material_abbreviation": data.get("material_abbreviation", ""),
122
- # "section": item.get("section", ""),
123
- # "property_name": item.get("property_name", ""),
124
- # "value": item.get("value", ""),
125
- # "unit": item.get("unit", ""),
126
- # "english": item.get("english", ""),
127
- # "test_condition": item.get("test_condition", ""),
128
- # "comments": item.get("comments", "")
129
- # })
130
- # return pd.DataFrame(rows)
131
- def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
132
- """Convert extracted JSON to DataFrame, ensuring abbreviation is not empty."""
133
- mat_name = data.get("material_name", "") or ""
134
- mat_abbr = data.get("material_abbreviation", "") or ""
135
-
136
- if not mat_abbr:
137
- mat_abbr = make_abbreviation(mat_name)
138
-
139
- rows = []
140
- for item in data.get("mechanical_properties", []):
141
- rows.append({
142
- "material_name": mat_name,
143
- "material_abbreviation": mat_abbr,
144
- "section": item.get("section", "") or "Mechanical",
145
- "property_name": item.get("property_name", "") or "Unknown property",
146
- "value": item.get("value", "") or "N/A",
147
- "unit": item.get("unit", "") or "",
148
- "english": item.get("english", "") or "",
149
- "test_condition": item.get("test_condition", "") or "",
150
- "comments": item.get("comments", "") or "",
151
- })
152
- return pd.DataFrame(rows)
153
-
154
- def render_page(page, dpi=DPI):
155
- mat = fitz.Matrix(dpi/72, dpi/72)
156
- pix = page.get_pixmap(matrix=mat, alpha=False)
157
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
158
- return img, mat
159
-
160
- def pdf_to_px_bbox(bbox_pdf, mat):
161
- x0, y0, x1, y1 = bbox_pdf
162
- sx, sy = mat.a, mat.d
163
- return (int(float(x0) * sx), int(float(y0) * sy), int(float(x1) * sx), int(float(y1) * sy))
164
-
165
- def safe_crop_px(pil_img, box):
166
- if not isinstance(box, (tuple, list)):
167
- return None
168
- if len(box) == 1 and isinstance(box[0], (tuple, list)) and len(box[0]) == 4:
169
- box = box[0]
170
- if len(box) != 4:
171
- return None
172
-
173
- x0, y0, x1, y1 = box
174
- if any(isinstance(v, (tuple, list)) for v in (x0, y0, x1, y1)):
175
- return None
176
-
177
- try:
178
- x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
179
- except (TypeError, ValueError):
180
- return None
181
-
182
- if x1 < x0: x0, x1 = x1, x0
183
- if y1 < y0: y0, y1 = y1, y0
184
-
185
- W, H = pil_img.size
186
- x0 = max(0, min(W, x0))
187
- x1 = max(0, min(W, x1))
188
- y0 = max(0, min(H, y0))
189
- y1 = max(0, min(H, y1))
190
- if x1 <= x0 or y1 <= y0:
191
- return None
192
- return pil_img.crop((x0, y0, x1, y1))
193
-
194
- def find_caption_blocks(page):
195
- caps = []
196
- blocks = page.get_text("blocks")
197
- for b in blocks:
198
- x0, y0, x1, y1, text = b[0], b[1], b[2], b[3], b[4]
199
- t = " ".join(str(text).strip().split())
200
- if CAP_RE.match(t):
201
- caps.append({"bbox": (x0, y0, x1, y1), "text": t})
202
- return caps
203
-
204
- def dhash64(pil_img):
205
- gray = pil_img.convert("L").resize((9, 8), Image.LANCZOS)
206
- pixels = list(gray.getdata())
207
- bits = 0
208
- for r in range(8):
209
- for c in range(8):
210
- left = pixels[r * 9 + c]
211
- right = pixels[r * 9 + c + 1]
212
- bits = (bits << 1) | (1 if left > right else 0)
213
- return bits
214
-
215
- def has_colorbar_like_strip(pil_img):
216
- img = np.array(pil_img)
217
- if img.ndim != 3:
218
- return False
219
- H, W, _ = img.shape
220
- if W < 250 or H < 150:
221
- return False
222
- strip_w = max(18, int(0.07 * W))
223
- strip = img[:, W-strip_w:W, :]
224
- q = (strip // 24).reshape(-1, 3)
225
- uniq = np.unique(q, axis=0)
226
- return len(uniq) > 70
227
-
228
- def texture_score(pil_img):
229
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
230
- lap = cv2.Laplacian(gray, cv2.CV_64F)
231
- return float(lap.var())
232
-
233
- def is_mostly_legend(pil_img):
234
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
235
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
236
- bw = cv2.medianBlur(bw, 3)
237
- H, W = bw.shape
238
- fill = float(np.count_nonzero(bw)) / float(H * W)
239
- return (0.03 < fill < 0.18) and (min(H, W) < 260)
240
-
241
- def detect_axes_lines(pil_img):
242
- gray = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
243
- edges = cv2.Canny(gray, 50, 150)
244
- H, W = gray.shape
245
- min_len = int(0.28 * min(H, W))
246
-
247
- lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=90, minLineLength=min_len, maxLineGap=14)
248
- if lines is None:
249
- return None, None
250
-
251
- horizontals, verticals = [], []
252
- for x1, y1, x2, y2 in lines[:, 0]:
253
- dx, dy = abs(x2-x1), abs(y2-y1)
254
- length = math.hypot(dx, dy)
255
- if dy < 18 and dx > 0.35 * W:
256
- horizontals.append((length, (x1, y1, x2, y2)))
257
- if dx < 18 and dy > 0.35 * H:
258
- verticals.append((length, (x1, y1, x2, y2)))
259
-
260
- if not horizontals or not verticals:
261
- return None, None
262
-
263
- horizontals.sort(key=lambda t: t[0], reverse=True)
264
- verticals.sort(key=lambda t: t[0], reverse=True)
265
- return horizontals[0][1], verticals[0][1]
266
-
267
- def axis_intersection_ok(x_axis, y_axis, W, H):
268
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
269
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
270
- if not (0 <= xa_y < H and 0 <= ya_x < W):
271
- return False
272
- if ya_x > int(0.95 * W) or xa_y < int(0.05 * H):
273
- return False
274
- return True
275
-
276
- def tick_text_presence_score(pil_img, x_axis, y_axis):
277
- img = np.array(pil_img)
278
- gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
279
- bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
280
- bw = cv2.medianBlur(bw, 3)
281
-
282
- H, W = gray.shape
283
- xa_y = int(round((x_axis[1] + x_axis[3]) / 2))
284
- ya_x = int(round((y_axis[0] + y_axis[2]) / 2))
285
-
286
- y0a = max(0, xa_y - 40)
287
- y1a = min(H, xa_y + 110)
288
- x_roi = bw[y0a:y1a, 0:W]
289
-
290
- x0b = max(0, ya_x - 180)
291
- x1b = min(W, ya_x + 50)
292
- y_roi = bw[0:H, x0b:x1b]
293
-
294
- def count_small_components(mask):
295
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
296
- cnt = 0
297
- for i in range(1, num):
298
- x, y, w, h, area = stats[i]
299
- if 4 <= w <= 150 and 4 <= h <= 150 and 20 <= area <= 5000:
300
- cnt += 1
301
- return cnt
302
-
303
- return count_small_components(x_roi) + count_small_components(y_roi)
304
-
305
- def is_real_plot(pil_img):
306
- if has_colorbar_like_strip(pil_img):
307
- return False
308
- if is_mostly_legend(pil_img):
309
- return False
310
-
311
- x_axis, y_axis = detect_axes_lines(pil_img)
312
- if x_axis is None or y_axis is None:
313
- return False
314
-
315
- arr = np.array(pil_img)
316
- H, W = arr.shape[0], arr.shape[1]
317
- if not axis_intersection_ok(x_axis, y_axis, W, H):
318
- return False
319
-
320
- if texture_score(pil_img) > 2200:
321
- return False
322
-
323
- score = tick_text_presence_score(pil_img, x_axis, y_axis)
324
- return score >= 18
325
-
326
- def connected_components_boxes(pil_img):
327
- img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
328
- gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
329
- mask = (gray < 245).astype(np.uint8) * 255
330
- mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((7, 7), np.uint8), iterations=2)
331
- num, _, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
332
-
333
- boxes = []
334
- for i in range(1, num):
335
- x, y, w, h, area = stats[i]
336
- boxes.append((int(area), (int(x), int(y), int(x + w), int(y + h))))
337
- boxes.sort(key=lambda t: t[0], reverse=True)
338
- return boxes
339
-
340
- def expand_box(box, W, H, left=0.10, right=0.06, top=0.06, bottom=0.18):
341
- x0, y0, x1, y1 = box
342
- bw = x1 - x0
343
- bh = y1 - y0
344
- ex0 = max(0, int(x0 - left * bw))
345
- ex1 = min(W, int(x1 + right * bw))
346
- ey0 = max(0, int(y0 - top * bh))
347
- ey1 = min(H, int(y1 + bottom * bh))
348
- return (ex0, ey0, ex1, ey1)
349
-
350
- def crop_plot_from_caption(page_img, cap_bbox_pdf, mat):
351
- cap_px = pdf_to_px_bbox(cap_bbox_pdf, mat)
352
- cap_y0 = cap_px[1]
353
- cap_y1 = cap_px[3]
354
-
355
- W, H = page_img.size
356
- search_top = max(0, cap_y0 - int(0.95 * H))
357
- search_bot = min(H, cap_y1 + int(0.20 * H))
358
- region = safe_crop_px(page_img, (0, search_top, W, search_bot))
359
- if region is None:
360
- return None
361
-
362
- comps = connected_components_boxes(region)
363
- best = None
364
- best_area = -1
365
-
366
- for area, box in comps[:35]:
367
- x0, y0, x1, y1 = box
368
- bw = x1 - x0
369
- bh = y1 - y0
370
- if bw < 220 or bh < 180:
371
- continue
372
-
373
- exp = expand_box(box, region.size[0], region.size[1])
374
- cand = safe_crop_px(region, exp)
375
- if cand is None:
376
- continue
377
-
378
- if not is_real_plot(cand):
379
- continue
380
-
381
- if area > best_area:
382
- best_area = area
383
- best = cand
384
-
385
- return best
386
-
387
- def extract_images(pdf_path, paper_id="uploaded_paper"):
388
- """Extract plot images from PDF"""
389
- out_paper = os.path.join(OUT_DIR, paper_id)
390
- out_imgs = os.path.join(out_paper, "plots_with_axes")
391
- os.makedirs(out_imgs, exist_ok=True)
392
-
393
- doc = fitz.open(pdf_path)
394
- results = []
395
- seen = set()
396
- saved = 0
397
-
398
- for p in range(len(doc)):
399
- page = doc[p]
400
- caps = find_caption_blocks(page)
401
- if not caps:
402
- continue
403
-
404
- page_img, mat = render_page(page, dpi=DPI)
405
-
406
- for cap in caps:
407
- cap_text = cap["text"]
408
-
409
- if KEEP_ONLY_STRESS_STRAIN and not SS_KW.search(cap_text):
410
- continue
411
-
412
- fig = crop_plot_from_caption(page_img, cap["bbox"], mat)
413
- if fig is None:
414
- continue
415
-
416
- if fig.size[0] > 8 and fig.size[1] > 8:
417
- fig = fig.crop((2, 2, fig.size[0]-2, fig.size[1]-2))
418
-
419
- try:
420
- h = dhash64(fig)
421
- except Exception:
422
- continue
423
-
424
- if h in seen:
425
- continue
426
- seen.add(h)
427
-
428
- img_name = f"p{p+1:02d}_{saved:04d}.png"
429
- img_path = os.path.join(out_imgs, img_name)
430
- fig.save(img_path)
431
-
432
- results.append({
433
- "page": p + 1,
434
- "caption": cap_text,
435
- "image": img_path
436
- })
437
- saved += 1
438
-
439
- return results
440
-
441
- def input_form():
442
- PROPERTY_CATEGORIES = {
443
- "Polymer": [
444
- "Thermal",
445
- "Mechanical",
446
- "Processing",
447
- "Physical",
448
- "Descriptive",
449
- ],
450
- "Fiber": [
451
- "Mechanical",
452
- "Physical",
453
- "Thermal",
454
- "Descriptive",
455
- ],
456
- "Composite": [
457
- "Mechanical",
458
- "Thermal",
459
- "Processing",
460
- "Physical",
461
- "Descriptive",
462
- "Composition / Reinforcement",
463
- "Architecture / Structure",
464
- ],
465
- }
466
-
467
- PROPERTY_NAMES = {
468
- "Polymer": {
469
- "Thermal": [
470
- "Glass transition temperature (Tg)",
471
- "Melting temperature (Tm)",
472
- "Crystallization temperature (Tc)",
473
- "Degree of crystallinity",
474
- "Decomposition temperature",
475
- ],
476
- "Mechanical": [
477
- "Tensile modulus",
478
- "Tensile strength",
479
- "Elongation at break",
480
- "Flexural modulus",
481
- "Impact strength",
482
- ],
483
- "Processing": [
484
- "Melt flow index (MFI)",
485
- "Processing temperature",
486
- "Cooling rate",
487
- "Mold shrinkage",
488
- ],
489
- "Physical": [
490
- "Density",
491
- "Specific gravity",
492
- ],
493
- "Descriptive": [
494
- "Material grade",
495
- "Manufacturer",
496
- ],
497
- },
498
-
499
- "Fiber": {
500
- "Mechanical": [
501
- "Tensile modulus",
502
- "Tensile strength",
503
- "Strain to failure",
504
- ],
505
- "Physical": [
506
- "Density",
507
- "Fiber diameter",
508
- ],
509
- "Thermal": [
510
- "Decomposition temperature",
511
- ],
512
- "Descriptive": [
513
- "Fiber type",
514
- "Surface treatment",
515
- ],
516
- },
517
-
518
- "Composite": {
519
- "Mechanical": [
520
- "Longitudinal modulus (E1)",
521
- "Transverse modulus (E2)",
522
- "Shear modulus (G12)",
523
- "Poissons ratio (V12)",
524
- "Tensile strength (fiber direction)",
525
- "Interlaminar shear strength",
526
- ],
527
- "Thermal": [
528
- "Glass transition temperature (matrix)",
529
- "Coefficient of thermal expansion (CTE)",
530
- ],
531
- "Processing": [
532
- "Curing temperature",
533
- "Curing pressure",
534
- ],
535
- "Physical": [
536
- "Density",
537
- ],
538
- "Descriptive": [
539
- "Laminate type",
540
- ],
541
- "Composition / Reinforcement": [
542
- "Fiber volume fraction",
543
- "Fiber weight fraction",
544
- "Fiber type",
545
- "Matrix type",
546
- ],
547
- "Architecture / Structure": [
548
- "Weave type",
549
- "Ply orientation",
550
- "Number of plies",
551
- "Stacking sequence",
552
- ],
553
- },
554
- }
555
-
556
-
557
-
558
- st.title("Materials Property Input Form")
559
-
560
- material_class = st.selectbox(
561
- "Select Material Class",
562
- ("Polymer", "Fiber", "Composite"),
563
- index=None,
564
- placeholder="Choose material class",
565
- )
566
-
567
- if material_class:
568
- property_category = st.selectbox(
569
- "Select Property Category",
570
- PROPERTY_CATEGORIES[material_class],
571
- index=None,
572
- placeholder="Choose property category",
573
- )
574
- else:
575
- property_category = None
576
-
577
- if material_class and property_category:
578
- property_name = st.selectbox(
579
- "Select Property",
580
- PROPERTY_NAMES[material_class][property_category],
581
- index=None,
582
- placeholder="Choose property",
583
- )
584
- else:
585
- property_name = None
586
-
587
- if material_class and property_category and property_name:
588
- with st.form("user_input"):
589
- st.subheader("Enter Data")
590
-
591
- material_name = st.text_input("Material Name")
592
- material_abbr = st.text_input("Material Abbreviation")
593
-
594
- value = st.text_input("Value")
595
- unit = st.text_input("Unit (SI)")
596
- english = st.text_input("English Units")
597
- test_condition = st.text_input("Test Condition")
598
- comments = st.text_area("Comments")
599
-
600
- submitted = st.form_submit_button("Submit")
601
-
602
- if submitted:
603
- if not (material_name and value):
604
- st.error("Material name and value are required.")
605
- else:
606
- Input_db = pd.DataFrame([{
607
- "material_class": material_class,
608
- "material_name": material_name,
609
- "material_abbreviation": material_abbr,
610
- "section": property_category,
611
- "property_name": property_name,
612
- "value": value,
613
- "unit": unit,
614
- "english_units": english,
615
- "test_condition": test_condition,
616
- "comments": comments
617
- }])
618
-
619
- st.success("Property added successfully")
620
- st.dataframe(Input_db)
621
-
622
-
623
- if "user_uploaded_data" not in st.session_state:
624
- st.session_state["user_uploaded_data"] = Input_db
625
- else:
626
- st.session_state["user_uploaded_data"] = pd.concat(
627
- [st.session_state["user_uploaded_data"], Input_db],
628
- ignore_index=True
629
- )
630
- def main():
631
- input_form()
632
- st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")
633
- st.title("PDF Material Data & Plot Extractor")
634
-
635
- uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
636
-
637
- if not uploaded_file:
638
- st.info("Upload a PDF to extract material data and plots")
639
- return
640
-
641
- paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")
642
-
643
- tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
644
-
645
- with tempfile.TemporaryDirectory() as tmpdir:
646
- pdf_path = os.path.join(tmpdir, uploaded_file.name)
647
- with open(pdf_path, "wb") as f:
648
- f.write(uploaded_file.getbuffer())
649
-
650
- with tab1:
651
- st.subheader("Material Properties Data")
652
-
653
- with st.spinner(" Extracting material data..."):
654
- with open(pdf_path, "rb") as f:
655
- pdf_bytes = f.read()
656
-
657
- data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
658
-
659
- if data:
660
- df = convert_to_dataframe(data)
661
-
662
- if not df.empty:
663
- st.success(f"Extracted {len(df)} properties")
664
-
665
- col1, col2 = st.columns(2)
666
- with col1:
667
- st.metric("Material", data.get("material_name", "N/A"))
668
- with col2:
669
- st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))
670
-
671
- st.dataframe(df, use_container_width=True, height=400)
672
- st.subheader("Assign Material Category")
673
-
674
- extracted_material_class = st.selectbox(
675
- "Select category for this material",
676
- ["Polymer", "Fiber", "Composite"],
677
- index=None,
678
- placeholder="Required before adding to database"
679
- )
680
- if st.button(" Add to Database"):
681
- if not extracted_material_class:
682
- st.error("Please select a material category before adding.")
683
- else:
684
- df["material_class"] = extracted_material_class
685
-
686
- if "user_uploaded_data" not in st.session_state:
687
- st.session_state["user_uploaded_data"] = df
688
- else:
689
- st.session_state["user_uploaded_data"] = pd.concat(
690
- [st.session_state["user_uploaded_data"], df],
691
- ignore_index=True
692
- )
693
-
694
- st.success(f"Added to {extracted_material_class} database!")
695
-
696
- # if st.button(" Add to Database"):
697
- # if "user_uploaded_data" not in st.session_state:
698
- # st.session_state["user_uploaded_data"] = df
699
- # else:
700
- # st.session_state["user_uploaded_data"] = pd.concat(
701
- # [st.session_state["user_uploaded_data"], df],
702
- # ignore_index=True
703
- # )
704
- # st.success("Added to database!")
705
-
706
- csv = df.to_csv(index=False)
707
- st.download_button(
708
- "Download CSV",
709
- data=csv,
710
- file_name=f"{paper_id}_data.csv",
711
- mime="text/csv"
712
- )
713
- else:
714
- st.warning("No data extracted")
715
- else:
716
- st.error("Failed to extract data from PDF")
717
-
718
- with tab2:
719
- st.subheader("Extracted Plot Images")
720
-
721
- with st.spinner(" Extracting plots from PDF..."):
722
- image_results = extract_images(pdf_path, paper_id=paper_id)
723
-
724
- if image_results:
725
- st.success(f" Extracted {len(image_results)} plots")
726
-
727
- for r in image_results:
728
- st.markdown(f"**Page {r['page']}** — {r['caption']}")
729
- st.image(r["image"], use_container_width=True)
730
- st.divider()
731
- else:
732
- st.warning("No plots found in PDF")
733
-
734
-
735
- if __name__ == "__main__":
736
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/__pycache__/page1.cpython-312.pyc DELETED
Binary file (4.86 kB)
 
src/pages/pages/categorized/__pycache__/page1.cpython-313.pyc DELETED
Binary file (4.94 kB)
 
src/pages/pages/categorized/__pycache__/page1.cpython-314.pyc DELETED
Binary file (9.83 kB)
 
src/pages/pages/categorized/__pycache__/page2.cpython-312.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page2.cpython-313.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page2.cpython-314.pyc DELETED
Binary file (672 Bytes)
 
src/pages/pages/categorized/__pycache__/page3.cpython-313.pyc DELETED
Binary file (596 Bytes)
 
src/pages/pages/categorized/__pycache__/page3.cpython-314.pyc DELETED
Binary file (2.93 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc DELETED
Binary file (34 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc.2029864538672 DELETED
Binary file (8.01 kB)
 
src/pages/pages/categorized/__pycache__/page6.cpython-314.pyc.2097035857760 DELETED
Binary file (1.22 kB)
 
src/pages/pages/categorized/page1.py DELETED
@@ -1,307 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- from PIL import Image
4
- import re
5
-
6
- def extract_matrix_fiber_from_abbr(abbr: str):
7
- if not isinstance(abbr, str):
8
- return None, None
9
-
10
- text = abbr.lower()
11
-
12
- matrix_map = {
13
- "epoxy": "Epoxy",
14
- "cyanate ester": "Cyanate Ester",
15
- "cynate ester": "Cyanate Ester",
16
- "polypropylene": "Polypropylene",
17
- "pp": "Polypropylene",
18
- "peek": "PEEK",
19
- "pei": "PEI",
20
- "nylon": "Nylon",
21
- "pa6": "PA6",
22
- "polyester": "Polyester",
23
- "vinyl ester": "Vinyl Ester",
24
- "phenolic": "Phenolic"
25
- }
26
-
27
- matrix = None
28
- for key, val in matrix_map.items():
29
- if key in text:
30
- matrix = val
31
- break
32
-
33
- fiber_map = {
34
- "carbon": "Carbon Fiber",
35
- "glass": "Glass Fiber",
36
- "e-glass": "E-Glass Fiber",
37
- "s-glass": "S-Glass Fiber",
38
- "aramid": "Aramid Fiber",
39
- "kevlar": "Kevlar Fiber",
40
- "basalt": "Basalt Fiber",
41
- "natural": "Natural Fiber"
42
- }
43
-
44
- fiber = None
45
- for key, val in fiber_map.items():
46
- if key in text:
47
- fiber = val
48
- break
49
-
50
- return matrix, fiber
51
-
52
-
53
- def main():
54
- st.set_page_config(layout="wide")
55
-
56
- mat_section = st.sidebar.expander("Materials", expanded=False)
57
- with mat_section:
58
- thermo = mat_section.button("Composites")
59
- polymers = mat_section.button("Polymers")
60
- Fibers = mat_section.button("Fibers")
61
-
62
- if "material_type" not in st.session_state:
63
- st.session_state.material_type = "Composites"
64
-
65
- if thermo:
66
- st.session_state.material_type = "Composites"
67
- elif polymers:
68
- st.session_state.material_type = "Polymers"
69
- elif Fibers:
70
- st.session_state.material_type = "Fibers"
71
-
72
- @st.cache_data
73
- def load_data(material_type):
74
- file_map = {
75
- "Composites": "src/data/data/Composites_material_data.csv",
76
- "Polymers": "src/data/data/polymers_material_data.csv",
77
- "Fibers": "src/data/data/Fibers_material_data.csv",
78
- }
79
- return pd.read_csv(file_map[material_type])
80
-
81
- csv_data = load_data(st.session_state.material_type)
82
-
83
- # if "user_uploaded_data" in st.session_state:
84
- # df = pd.concat([csv_data, st.session_state["user_uploaded_data"]], ignore_index=True)
85
- # else:
86
- # df = csv_data
87
- # Normalize naming between pages
88
- CLASS_MAP = {
89
- "Polymers": "Polymer",
90
- "Fibers": "Fiber",
91
- "Composites": "Composite",
92
- }
93
-
94
- current_class = CLASS_MAP[st.session_state.material_type]
95
-
96
- if "user_uploaded_data" in st.session_state:
97
- user_df = st.session_state["user_uploaded_data"]
98
-
99
- filtered_user_df = user_df[
100
- user_df["material_class"] == current_class
101
- ]
102
-
103
- df = pd.concat([csv_data, filtered_user_df], ignore_index=True)
104
- else:
105
- df = csv_data
106
-
107
-
108
- st.session_state["base_data"] = df
109
-
110
- st.title("Materials DataSet")
111
-
112
- materials_df = (
113
- df[["material_abbreviation", "material_name"]]
114
- .fillna("")
115
- .drop_duplicates()
116
- .reset_index(drop=True)
117
- )
118
-
119
- materials_df[["Matrix", "Fiber"]] = materials_df["material_abbreviation"].apply(
120
- lambda x: pd.Series(extract_matrix_fiber_from_abbr(x))
121
- )
122
-
123
-
124
- col1, col2 = st.columns(2, vertical_alignment="center")
125
-
126
- # st.subheader("Filter Composites")
127
-
128
- # matrix_options = sorted(
129
- # materials_df["Matrix"].dropna().unique()
130
- # )
131
-
132
- # fiber_options = sorted(
133
- # materials_df["Fiber"].dropna().unique()
134
- # )
135
-
136
- # fcol1, fcol2 = st.columns(2)
137
-
138
- # with fcol1:
139
- # selected_matrix = st.selectbox(
140
- # "Matrix Material",
141
- # ["All"] + matrix_options
142
- # )
143
-
144
- # with fcol2:
145
- # selected_fiber = st.selectbox(
146
- # "Fiber Material",
147
- # ["All"] + fiber_options
148
- # )
149
-
150
-
151
- # filtered_materials_df = materials_df.copy()
152
-
153
- # if selected_matrix != "All":
154
- # filtered_materials_df = filtered_materials_df[
155
- # filtered_materials_df["Matrix"] == selected_matrix
156
- # ]
157
-
158
- # if selected_fiber != "All":
159
- # filtered_materials_df = filtered_materials_df[
160
- # filtered_materials_df["Fiber"] == selected_fiber
161
- # ]
162
-
163
-
164
- with col1:
165
- st.write("Filter Composites")
166
-
167
- selected_matrix = "All"
168
- selected_fiber = "All"
169
-
170
- if st.session_state.material_type == "Composites":
171
-
172
-
173
- matrix_options = sorted(
174
- materials_df["Matrix"].dropna().unique()
175
- )
176
-
177
- fiber_options = sorted(
178
- materials_df["Fiber"].dropna().unique()
179
- )
180
-
181
- fcol1, fcol2 = st.columns(2)
182
-
183
- with fcol1:
184
- selected_matrix = st.selectbox(
185
- "Matrix Material",
186
- ["All"] + matrix_options
187
- )
188
-
189
- with fcol2:
190
- selected_fiber = st.selectbox(
191
- "Fiber Material",
192
- ["All"] + fiber_options
193
- )
194
-
195
-
196
-
197
- filtered_materials_df = materials_df.copy()
198
-
199
- if st.session_state.material_type == "Composites":
200
- if selected_matrix != "All":
201
- filtered_materials_df = filtered_materials_df[
202
- filtered_materials_df["Matrix"] == selected_matrix
203
- ]
204
-
205
- if selected_fiber != "All":
206
- filtered_materials_df = filtered_materials_df[
207
- filtered_materials_df["Fiber"] == selected_fiber
208
- ]
209
-
210
- st.write("Select Material")
211
- st.dataframe(
212
- filtered_materials_df,
213
- key="material_table",
214
- selection_mode="single-cell",
215
- on_select="rerun",
216
- use_container_width=True,
217
- height=260
218
- )
219
-
220
- def get_selected_value(df, key, column_name):
221
- if key in st.session_state:
222
- sel = st.session_state[key]["selection"]["cells"]
223
- if sel:
224
- row_idx = sel[0][0]
225
- return df.iloc[row_idx][column_name]
226
- return None
227
-
228
-
229
- mat = get_selected_value(materials_df, "material_table", "material_abbreviation")
230
-
231
- with col2:
232
- st.write("Select Property")
233
-
234
- if mat:
235
- filtered_df = df[
236
- (df["material_abbreviation"] == mat) &
237
- (df["value"].notna()) &
238
- (df["property_name"].notna())
239
- ]
240
- property_sel = st.selectbox(
241
- "Type of Property",
242
- filtered_df["section"].drop_duplicates()
243
- )
244
-
245
- properties_df = (
246
- filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
247
- .drop_duplicates()
248
- .reset_index(drop=True)
249
- )
250
- else:
251
- filtered_df = df[df["value"].notna() & df["property_name"].notna()]
252
- property_sel = st.selectbox(
253
- "Type of Property",
254
- filtered_df["section"].drop_duplicates()
255
- )
256
-
257
- properties_df = (
258
- filtered_df[filtered_df["section"] == property_sel][["property_name", "section"]]
259
- .drop_duplicates()
260
- .reset_index(drop=True)
261
- )
262
-
263
- st.dataframe(
264
- properties_df,
265
- key="property_table",
266
- selection_mode="single-cell",
267
- on_select="rerun",
268
- use_container_width=True,
269
- height=260
270
- )
271
-
272
- prop = get_selected_value(properties_df, "property_table", "property_name")
273
-
274
- st.write("")
275
- if st.button("Search", disabled=not (mat and prop)):
276
- st.write(f"**Material:** {mat}")
277
- st.write(f"**Property:** {prop}")
278
-
279
- result = df[
280
- (df["material_abbreviation"] == mat) &
281
- (df["property_name"] == prop) &
282
- (df["value"].notna())
283
- ]
284
-
285
- if not result.empty:
286
- st.subheader("Property Data")
287
- st.dataframe(result.T, use_container_width=True)
288
-
289
- st.subheader("Property Graph")
290
- img_path = f"src/images/images/{mat}_{prop}.png"
291
-
292
- try:
293
- img = Image.open(img_path)
294
- st.image(img, use_container_width=True, caption="Stress strain curve")
295
- except FileNotFoundError:
296
- st.write("")
297
- # fallback_img = Image.open("src/pages/pages/categorized/ESS-min.jpg")
298
- # st.image(fallback_img, use_container_width=True, caption="Stress strain curve")
299
-
300
- else:
301
- st.warning("No data found for this material-property combination")
302
-
303
-
304
-
305
-
306
-
307
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page2.py DELETED
@@ -1,265 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import os
4
- from PIL import Image
5
- import boto3
6
- import tabula
7
- import faiss
8
- import json
9
- import base64
10
- import pymupdf
11
- import requests
12
- import os
13
- import logging
14
- import numpy as np
15
- import warnings
16
- from tqdm import tqdm
17
- from botocore.exceptions import ClientError
18
- from langchain_text_splitters import RecursiveCharacterTextSplitter
19
- from IPython import display
20
- from langchain_aws import ChatBedrock
21
-
22
-
23
- from pathlib import Path
24
-
25
- def main():
26
-
27
-
28
-
29
-
30
- logger = logging.getLogger(__name__)
31
- logger.setLevel(logging.ERROR)
32
-
33
- warnings.filterwarnings("ignore")
34
-
35
- def create_directories(base_dir):
36
- directories = ["images", "text", "tables", "page_images"]
37
- for dir in directories:
38
- os.makedirs(os.path.join(base_dir, dir), exist_ok=True)
39
-
40
-
41
- def process_tables(doc, page_num, base_dir, items):
42
- try:
43
- tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
44
- if not tables:
45
- return
46
- for table_idx, table in enumerate(tables):
47
- table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
48
- table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
49
- with open(table_file_name, 'w') as f:
50
- f.write(table_text)
51
- items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
52
- except Exception as e:
53
- print(f"Error extracting tables from page {page_num}: {str(e)}")
54
-
55
- doc = pymupdf.open(filepath)
56
- num_pages = len(doc)
57
- base_dir = "data"
58
-
59
- # Creating the directories
60
- create_directories(base_dir)
61
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200, length_function=len)
62
- items = []
63
-
64
- # Process each page of the PDF
65
- for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
66
- page = doc[page_num]
67
- process_tables(doc, page_num, base_dir, items)
68
-
69
- [i for i in items if i['type'] == 'table'][0]
70
- # Generating Multimodal Embeddings using Amazon Titan Multimodal Embeddings model
71
- def generate_multimodal_embeddings(prompt=None, image=None, output_embedding_length=384):
72
- """
73
- Invoke the Amazon Titan Multimodal Embeddings model using Amazon Bedrock runtime.
74
-
75
- Args:
76
- prompt (str): The text prompt to provide to the model.
77
- image (str): A base64-encoded image data.
78
- Returns:
79
- str: The model's response embedding.
80
- """
81
- if not prompt and not image:
82
- raise ValueError("Please provide either a text prompt, base64 image, or both as input")
83
-
84
- # Initialize the Amazon Bedrock runtime client
85
- client = boto3.client(service_name="bedrock-runtime")
86
- model_id = "amazon.titan-embed-image-v1"
87
-
88
- body = {"embeddingConfig": {"outputEmbeddingLength": output_embedding_length}}
89
-
90
- if prompt:
91
- body["inputText"] = prompt
92
- if image:
93
- body["inputImage"] = image
94
-
95
- try:
96
- response = client.invoke_model(
97
- modelId=model_id,
98
- body=json.dumps(body),
99
- accept="application/json",
100
- contentType="application/json"
101
- )
102
-
103
- # Process and return the response
104
- result = json.loads(response.get("body").read())
105
- return result.get("embedding")
106
-
107
- except ClientError as err:
108
- print(f"Couldn't invoke Titan embedding model. Error: {err.response['Error']['Message']}")
109
- return None
110
-
111
- # Set embedding vector dimension
112
- embedding_vector_dimension = 384
113
-
114
- # Count the number of each type of item
115
- item_counts = {
116
- 'text': sum(1 for item in items if item['type'] == 'text'),
117
- 'table': sum(1 for item in items if item['type'] == 'table'),
118
- 'image': sum(1 for item in items if item['type'] == 'image'),
119
- 'page': sum(1 for item in items if item['type'] == 'page')
120
- }
121
-
122
- # Initialize counters
123
- counters = dict.fromkeys(item_counts.keys(), 0)
124
-
125
- # Generate embeddings for all items
126
- with tqdm(
127
- total=len(items),
128
- desc="Generating embeddings",
129
- bar_format=(
130
- "{l_bar}{bar}| {n_fmt}/{total_fmt} "
131
- "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
132
- )
133
- ) as pbar:
134
-
135
- for item in items:
136
- item_type = item['type']
137
- counters[item_type] += 1
138
-
139
- if item_type in ['text', 'table']:
140
- # For text or table, use the formatted text representation
141
- item['embedding'] = generate_multimodal_embeddings(prompt=item['text'],output_embedding_length=embedding_vector_dimension)
142
- else:
143
- # For images, use the base64-encoded image data
144
- item['embedding'] = generate_multimodal_embeddings(image=item['image'], output_embedding_length=embedding_vector_dimension)
145
-
146
- # Update the progress bar
147
- pbar.set_postfix_str(f"Text: {counters['text']}/{item_counts['text']}, Table: {counters['table']}/{item_counts['table']}, Image: {counters['image']}/{item_counts['image']}")
148
- pbar.update(1)
149
-
150
- # All the embeddings
151
- all_embeddings = np.array([item['embedding'] for item in items])
152
-
153
- # Create FAISS Index
154
- index = faiss.IndexFlatL2(embedding_vector_dimension)
155
-
156
- # Clear any pre-existing index
157
- index.reset()
158
-
159
- # Add embeddings to the index
160
- index.add(np.array(all_embeddings, dtype=np.float32))
161
-
162
- # Generating RAG response with Amazon Nova
163
- def invoke_nova_multimodal(prompt, matched_items):
164
- """
165
- Invoke the Amazon Nova model.
166
- """
167
-
168
-
169
- # Define your system prompt(s).
170
- system_msg = [
171
- { "text": """You are a helpful assistant for question answering.
172
- The text context is relevant information retrieved.
173
- The provided image(s) are relevant information retrieved."""}
174
- ]
175
-
176
- # Define one or more messages using the "user" and "assistant" roles.
177
- message_content = []
178
-
179
- for item in matched_items:
180
- if item['type'] == 'text' or item['type'] == 'table':
181
- message_content.append({"text": item['text']})
182
- else:
183
- message_content.append({"image": {
184
- "format": "png",
185
- "source": {"bytes": item['image']},
186
- }
187
- })
188
-
189
-
190
- # Configure the inference parameters.
191
- inf_params = {"max_new_tokens": 300,
192
- "top_p": 0.9,
193
- "top_k": 20}
194
-
195
- # Define the final message list
196
- message_list = [
197
- {"role": "user", "content": message_content}
198
- ]
199
-
200
- # Adding the prompt to the message list
201
- message_list.append({"role": "user", "content": [{"text": prompt}]})
202
-
203
- native_request = {
204
- "messages": message_list,
205
- "system": system_msg,
206
- "inferenceConfig": inf_params,
207
- }
208
-
209
- # Initialize the Amazon Bedrock runtime client
210
- model_id = "amazon.nova-pro-v1:0"
211
- client = ChatBedrock(model_id=model_id)
212
-
213
- # Invoke the model and extract the response body.
214
- response = client.invoke(json.dumps(native_request))
215
- model_response = response.content
216
-
217
- return model_response
218
-
219
-
220
- # User Query
221
- query = "Which optimizer was used when training the models?"
222
-
223
- # Generate embeddings for the query
224
- query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
225
-
226
- # Search for the nearest neighbors in the vector database
227
- distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
228
-
229
- # Check the result (matched chunks)
230
- result.flatten()
231
-
232
- # Retrieve the matched items
233
- matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
234
-
235
- # Generate RAG response with Amazon Nova
236
- response = invoke_nova_multimodal(query, matched_items)
237
-
238
- display.Markdown(response)
239
-
240
- # List of queries (Replace with any query of your choice)
241
- other_queries = ["How long were the base and big models trained?",
242
- "Which optimizer was used when training the models?",
243
- "What is the position-wise feed-forward neural network mentioned in the paper?",
244
- "What is the BLEU score of the model in English to German translation (EN-DE)?",
245
- "How is the scaled-dot-product attention is calculated?",
246
- ]
247
-
248
- query = other_queries[0] # Replace with any query from the list above
249
-
250
- # Generate embeddings for the query
251
- query_embedding = generate_multimodal_embeddings(prompt=query,output_embedding_length=embedding_vector_dimension)
252
-
253
- # Search for the nearest neighbors in the vector database
254
- distances, result = index.search(np.array(query_embedding, dtype=np.float32).reshape(1,-1), k=5)
255
-
256
- # Retrieve the matched items
257
- matched_items = [{k: v for k, v in items[index].items() if k != 'embedding'} for index in result.flatten()]
258
-
259
- # Generate RAG response with Amazon Nova
260
- response = invoke_nova_multimodal(query, matched_items)
261
-
262
- # Display the response
263
- display.Markdown(response)
264
-
265
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page3.py DELETED
@@ -1,62 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import tabula
4
- import pymupdf
5
- import os
6
- from tqdm import tqdm
7
-
8
-
9
- def extract_tables_pymupdf(pdf_path):
10
- """Extract tables using PyMuPDF (alternative method)"""
11
- try:
12
- doc = pymupdf.open(pdf_path)
13
- all_tables = []
14
-
15
- for page_num in range(len(doc)):
16
- page = doc[page_num]
17
- tables = page.find_tables()
18
-
19
- for table in tables:
20
- # Extract table data
21
- table_data = table.extract()
22
- if table_data:
23
- # Convert to DataFrame
24
- df = pd.DataFrame(table_data[1:], columns=table_data[0])
25
- all_tables.append({
26
- 'page': page_num + 1,
27
- 'dataframe': df
28
- })
29
-
30
- doc.close()
31
- return all_tables
32
- except Exception as e:
33
- st.error(f"Error extracting tables with PyMuPDF: {e}")
34
- return []
35
-
36
- def main():
37
- st.title("PDF Table Extractor")
38
- st.write("Upload a PDF to extract all tables")
39
-
40
- temp_path = "temp_uploaded.pdf" # Define here
41
-
42
- uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
43
-
44
- if uploaded_file is not None:
45
- # Save uploaded file temporarily
46
- with open(temp_path, "wb") as f:
47
- f.write(uploaded_file.getbuffer())
48
-
49
- # Using PyMuPDF
50
- tables = extract_tables_pymupdf(temp_path)
51
-
52
- if tables:
53
- st.success(f"Found {len(tables)} tables!")
54
-
55
- for idx, table_info in enumerate(tables):
56
- st.subheader(f"Table {idx + 1} (Page {table_info['page']})")
57
- df = table_info['dataframe']
58
- st.dataframe(df, use_container_width=True)
59
-
60
- # Clean up temp file
61
- if os.path.exists(temp_path):
62
- os.remove(temp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/page4.py DELETED
@@ -1,5 +0,0 @@
1
- import streamlit as st
2
- from pathlib import Path
3
-
4
def main():
    """Render a placeholder heading naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
 
 
 
 
 
 
src/pages/pages/categorized/page5.py DELETED
@@ -1,5 +0,0 @@
1
- import streamlit as st
2
- from pathlib import Path
3
-
4
def main():
    """Render a placeholder heading naming this page's folder and file."""
    here = Path(__file__)
    st.write(f'# {here.parent.name} - {here.name}')
 
 
 
 
 
 
src/pages/pages/categorized/page6.py DELETED
@@ -1,671 +0,0 @@
1
- import os
2
- import re
3
- import json
4
- import tempfile
5
- import zipfile
6
- from io import BytesIO
7
- import fitz # PyMuPDF
8
- import cv2
9
- import numpy as np
10
-
11
- import streamlit as st
12
- import pandas as pd
13
- import requests
14
- import base64
15
- from typing import Dict, Any, Optional
16
- from collections import defaultdict
17
-
18
# Gemini API access configuration.
# SECURITY FIX: the API key was previously hard-coded here, i.e. a live
# credential committed to source control. It is now read from the
# environment — set GEMINI_API_KEY before running the app. The previously
# committed key should be revoked in the Google Cloud console.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
20
-
21
# Structured-output schema passed to Gemini (responseSchema). Despite the
# "mechanical_properties" key name, the prompt asks for properties of ALL
# categories in this single list; "section" carries the category label.
SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                # "unit" and "test_condition" are intentionally optional
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}
44
-
45
def make_abbreviation(name: str) -> str:
    """Derive an uppercase abbreviation from a material name.

    Takes the first letter of each word that starts with an alphabetic
    character; falls back to the first six characters (uppercased) when no
    such word exists, and to "UNKNOWN" for an empty/None name.
    """
    if not name:
        return "UNKNOWN"
    initials = [token[0] for token in name.split() if token and token[0].isalpha()]
    if initials:
        return "".join(initials).upper()
    return name[:6].upper()
52
-
53
# Rasterization resolution (dots per inch) used when rendering PDF pages.
DPI = 300
# Matches figure captions such as "Fig. 3" / "Figure 12" at the start of a text block.
CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
55
-
56
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Send a PDF to the Gemini API and return the parsed JSON extraction.

    The PDF is base64-inlined into a single generateContent request that asks
    for material name, abbreviation, and all properties following SCHEMA.
    Returns the decoded JSON dict, or None on any encoding/API/parse failure
    (errors are surfaced via st.error).

    NOTE(review): `filename` is currently unused — kept for interface
    stability / future logging.
    """
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    prompt = (
        "You are an expert materials scientist. From the attached PDF, extract the material name, "
        "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        "For each property, you MUST extract:\n"
        "- property_name\n- value (or range)\n- unit\n"
        "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
            ]
        }],
        "generationConfig": {
            # temperature 0 + responseSchema => deterministic, structured JSON
            "temperature": 0,
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        # Long timeout: large PDFs can take minutes to process server-side
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()

        candidates = data.get("candidates", [])
        if not candidates:
            return None

        # Take the first part whose text looks like a JSON object
        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        # Broad catch: network, HTTP, and JSON-decode errors all degrade to None
        st.error(f"Gemini API Error: {e}")
        return None
111
-
112
def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Flatten the Gemini extraction JSON into a one-row-per-property DataFrame.

    Falls back to a generated abbreviation when the model returned none, and
    substitutes defaults for missing/empty fields ("Mechanical" section,
    "Unknown property", "N/A" value, empty strings elsewhere).
    """
    material = data.get("material_name", "") or ""
    abbreviation = data.get("material_abbreviation", "") or ""
    if not abbreviation:
        abbreviation = make_abbreviation(material)

    def _as_row(prop):
        # One output record per extracted property, with per-field defaults.
        return {
            "material_name": material,
            "material_abbreviation": abbreviation,
            "section": prop.get("section", "") or "Mechanical",
            "property_name": prop.get("property_name", "") or "Unknown property",
            "value": prop.get("value", "") or "N/A",
            "unit": prop.get("unit", "") or "",
            "english": prop.get("english", "") or "",
            "test_condition": prop.get("test_condition", "") or "",
            "comments": prop.get("comments", "") or "",
        }

    return pd.DataFrame([_as_row(p) for p in data.get("mechanical_properties", [])])
134
-
135
- # --- IMAGE EXTRACTION LOGIC ---
136
def get_page_image(page):
    """Rasterize a PDF page at DPI resolution and return it as a BGR numpy image."""
    # Matrix scales from PDF points (72/inch) to the target DPI.
    pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
    # OpenCV works in BGR order downstream (imencode, etc.)
    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
140
-
141
def is_valid_plot_geometry(binary_crop):
    """Heuristically decide whether a binarized crop looks like a chart/plot.

    Rejects crops that are tiny (<100 px a side) or too dense with ink
    (>35% foreground, i.e. likely a photo or text block), then requires at
    least one long horizontal or vertical line — a plot axis signature.
    """
    h, w = binary_crop.shape
    if h < 100 or w < 100:
        return False
    ink_density = cv2.countNonZero(binary_crop) / (w * h)
    if ink_density > 0.35:
        return False
    # Erosion with a long thin kernel survives only if a line spans >= 1/4
    # of the crop in that direction.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (w // 4, 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, h // 4))
    has_h = cv2.countNonZero(cv2.erode(binary_crop, h_kernel, iterations=1)) > 0
    has_v = cv2.countNonZero(cv2.erode(binary_crop, v_kernel, iterations=1)) > 0
    return has_h or has_v
153
-
154
def merge_boxes(rects):
    """Deduplicate candidate boxes: drop any rectangle that is fully contained
    (within a 15-pixel tolerance) inside a larger, already-kept rectangle.

    Rectangles are (x, y, w, h); processing order is by area, largest first,
    so outer boxes absorb the inner ones they contain.
    """
    if not rects:
        return []

    by_area_desc = sorted(rects, key=lambda box: box[2] * box[3], reverse=True)
    kept = []
    for rect in by_area_desc:
        rx, ry, rw, rh = rect
        swallowed = any(
            rx >= kx - 15
            and ry >= ky - 15
            and rx + rw <= kx + kw + 15
            and ry + rh <= ky + kh + 15
            for (kx, ky, kw, kh) in kept
        )
        if not swallowed:
            kept.append(rect)
    return kept
164
-
165
def extract_images(pdf_doc):
    """Extract plot images from PDF using improved logic.

    For each page: rasterize, binarize, dilate, and find contours; keep
    contours whose area is 3%-80% of the page and that pass the plot-geometry
    heuristic; merge nested boxes; then pair each box with the nearest
    "Fig./Figure N" caption below it. Crops are PNG-encoded in memory.

    Returns a list of {"caption", "page", "image_data"} dicts, grouped by
    caption (multi-panel figures share one caption entry).
    """
    grouped_data = defaultdict(lambda: {"page": 0, "image_data": []})
    PADDING = 30  # pixels added around each detected box before cropping

    for page_num, page in enumerate(pdf_doc, start=1):
        img_bgr = get_page_image(page)
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        # Inverse threshold: ink becomes white (foreground) on black
        _, binary = cv2.threshold(gray, 225, 255, cv2.THRESH_BINARY_INV)
        kernel = np.ones((10, 10), np.uint8)
        # Dilation fuses nearby marks so one figure yields one contour
        dilated = cv2.dilate(binary, kernel, iterations=1)
        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        candidates = []
        page_h, page_w = gray.shape
        for cnt in contours:
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep regions occupying 3%-80% of the page area
            if 0.03 < (w * h) / (page_w * page_h) < 0.8:
                if is_valid_plot_geometry(binary[y:y+h, x:x+w]):
                    candidates.append((x, y, w, h))

        final_rects = merge_boxes(candidates)
        blocks = page.get_text("blocks")

        for (cx, cy, cw, ch) in final_rects:
            best_caption = f"Figure on Page {page_num} (Unlabeled)"
            min_dist = float('inf')
            for b in blocks:
                text = b[4].strip()
                if CAP_RE.match(text):
                    # Text blocks are in PDF points; scale to pixel space
                    cap_y = b[1] * (DPI/72)
                    dist = cap_y - (cy + ch)
                    # Caption must sit below the box, within 30% of page height
                    if 0 < dist < (page_h * 0.3) and dist < min_dist:
                        best_caption = text.replace('\n', ' ')
                        min_dist = dist

            # Clamp padded crop to the page bounds
            x1, y1 = max(0, cx - PADDING), max(0, cy - PADDING)
            x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
            crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]

            # Store image data in memory instead of saving to disk
            _, buffer = cv2.imencode('.png', crop)
            img_bytes = buffer.tobytes()

            fname = f"pg{page_num}_{cx}_{cy}.png"

            grouped_data[best_caption]["page"] = page_num
            grouped_data[best_caption]["image_data"].append({
                "filename": fname,
                "bytes": img_bytes,   # PNG-encoded, for downloads/zips
                "array": crop         # raw BGR array, for st.image display
            })

    results = [{"caption": k, "page": v["page"], "image_data": v["image_data"]} for k, v in grouped_data.items()]
    return results
220
-
221
def create_zip(results, include_json=True):
    """Bundle extracted plot images (and optionally a metadata JSON) into a
    zip archive, returned as raw bytes.

    `results` is the list produced by extract_images(); each entry's
    image_data items provide "filename" and "bytes". When include_json is
    True, a "plot_data.json" summary (caption/page/image_count per figure)
    is written first.
    """
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        if include_json:
            summary = [
                {
                    "caption": entry["caption"],
                    "page": entry["page"],
                    "image_count": len(entry["image_data"]),
                }
                for entry in results
            ]
            zf.writestr("plot_data.json", json.dumps(summary, indent=4))

        for entry in results:
            for image in entry['image_data']:
                zf.writestr(image['filename'], image['bytes'])

    archive.seek(0)
    return archive.getvalue()
236
-
237
def input_form():
    """Render the manual data-entry form: material class -> category ->
    property cascading selects, then a form for one property value.

    Submitted rows are appended to st.session_state["user_uploaded_data"]
    (a DataFrame), which is created on first submission.
    """
    # Which property categories exist for each material class.
    PROPERTY_CATEGORIES = {
        "Polymer": [
            "Thermal",
            "Mechanical",
            "Processing",
            "Physical",
            "Descriptive",
        ],
        "Fiber": [
            "Mechanical",
            "Physical",
            "Thermal",
            "Descriptive",
        ],
        "Composite": [
            "Mechanical",
            "Thermal",
            "Processing",
            "Physical",
            "Descriptive",
            "Composition / Reinforcement",
            "Architecture / Structure",
        ],
    }

    # Concrete property names per (material class, category).
    PROPERTY_NAMES = {
        "Polymer": {
            "Thermal": [
                "Glass transition temperature (Tg)",
                "Melting temperature (Tm)",
                "Crystallization temperature (Tc)",
                "Degree of crystallinity",
                "Decomposition temperature",
            ],
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Elongation at break",
                "Flexural modulus",
                "Impact strength",
            ],
            "Processing": [
                "Melt flow index (MFI)",
                "Processing temperature",
                "Cooling rate",
                "Mold shrinkage",
            ],
            "Physical": [
                "Density",
                "Specific gravity",
            ],
            "Descriptive": [
                "Material grade",
                "Manufacturer",
            ],
        },

        "Fiber": {
            "Mechanical": [
                "Tensile modulus",
                "Tensile strength",
                "Strain to failure",
            ],
            "Physical": [
                "Density",
                "Fiber diameter",
            ],
            "Thermal": [
                "Decomposition temperature",
            ],
            "Descriptive": [
                "Fiber type",
                "Surface treatment",
            ],
        },

        "Composite": {
            "Mechanical": [
                "Longitudinal modulus (E1)",
                "Transverse modulus (E2)",
                "Shear modulus (G12)",
                "Poissons ratio (V12)",
                "Tensile strength (fiber direction)",
                "Interlaminar shear strength",
            ],
            "Thermal": [
                "Glass transition temperature (matrix)",
                "Coefficient of thermal expansion (CTE)",
            ],
            "Processing": [
                "Curing temperature",
                "Curing pressure",
            ],
            "Physical": [
                "Density",
            ],
            "Descriptive": [
                "Laminate type",
            ],
            "Composition / Reinforcement": [
                "Fiber volume fraction",
                "Fiber weight fraction",
                "Fiber type",
                "Matrix type",
            ],
            "Architecture / Structure": [
                "Weave type",
                "Ply orientation",
                "Number of plies",
                "Stacking sequence",
            ],
        },
    }

    st.title("Materials Property Input Form")

    material_class = st.selectbox(
        "Select Material Class",
        ("Polymer", "Fiber", "Composite"),
        index=None,
        placeholder="Choose material class",
    )

    # Each select only appears after the previous one has a value.
    if material_class:
        property_category = st.selectbox(
            "Select Property Category",
            PROPERTY_CATEGORIES[material_class],
            index=None,
            placeholder="Choose property category",
        )
    else:
        property_category = None

    if material_class and property_category:
        property_name = st.selectbox(
            "Select Property",
            PROPERTY_NAMES[material_class][property_category],
            index=None,
            placeholder="Choose property",
        )
    else:
        property_name = None

    if material_class and property_category and property_name:
        with st.form("user_input"):
            st.subheader("Enter Data")

            material_name = st.text_input("Material Name")
            material_abbr = st.text_input("Material Abbreviation")

            value = st.text_input("Value")
            unit = st.text_input("Unit (SI)")
            english = st.text_input("English Units")
            test_condition = st.text_input("Test Condition")
            comments = st.text_area("Comments")

            submitted = st.form_submit_button("Submit")

            if submitted:
                # Minimal validation: name and value are mandatory
                if not (material_name and value):
                    st.error("Material name and value are required.")

                else:
                    # Single-row DataFrame for this submission
                    Input_db = pd.DataFrame([{
                        "material_class": material_class,
                        "material_name": material_name,
                        "material_abbreviation": material_abbr,
                        "section": property_category,
                        "property_name": property_name,
                        "value": value,
                        "unit": unit,
                        "english_units": english,
                        "test_condition": test_condition,
                        "comments": comments
                    }])

                    st.success("Property added successfully")
                    st.dataframe(Input_db)

                    # First submission seeds the session store; later ones append.
                    # NOTE(review): the early `return` here is redundant (the
                    # function returns right after anyway) but harmless.
                    if "user_uploaded_data" not in st.session_state:
                        st.session_state["user_uploaded_data"] = Input_db
                        return
                    else:
                        st.session_state["user_uploaded_data"] = pd.concat(
                            [st.session_state["user_uploaded_data"], Input_db],
                            ignore_index=True
                        )

    return
427
-
428
def main():
    """Entry point for the upload page: manual data-entry form plus a PDF
    uploader that extracts material properties (via Gemini) and plot images
    (via OpenCV/PyMuPDF) into two tabs.

    Extraction results are cached in st.session_state so reruns triggered by
    widget interaction do not re-call the API or re-scan the PDF.

    Fixes over the previous revision:
    - `st.image(..., width=img_width)` referenced an undefined name
      `img_width`, raising NameError whenever a plot was displayed; it now
      uses `use_container_width=True` (consistent with the other st.dataframe
      calls in this function).
    - The "Form was submitted" info message was missing a space between its
      concatenated halves ("data/plotsupload").
    """
    st.set_page_config(page_title="PDF Data & Image Extractor", layout="wide")

    # --- Initialize all session-state slots used below ---
    if 'image_results' not in st.session_state:
        st.session_state.image_results = []
    if 'pdf_processed' not in st.session_state:
        st.session_state.pdf_processed = False
    if 'current_pdf_name' not in st.session_state:
        st.session_state.current_pdf_name = None
    if 'form_submitted' not in st.session_state:
        st.session_state.form_submitted = False
    if 'pdf_data_extracted' not in st.session_state:
        st.session_state.pdf_data_extracted = False
    if 'pdf_extracted_df' not in st.session_state:
        st.session_state.pdf_extracted_df = pd.DataFrame()

    # Detect whether input_form() appended a row during this rerun by
    # comparing the uploaded-data row count before and after.
    prev_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))
    input_form()
    curr_uploaded_count = len(st.session_state.get("user_uploaded_data", pd.DataFrame()))

    if curr_uploaded_count > prev_uploaded_count:
        st.session_state.form_submitted = True

    st.title("PDF Material Data & Plot Extractor")

    uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])

    if not uploaded_file:
        # No file: show a hint and reset all PDF-related state.
        st.info("Upload a PDF to extract material data and plots")
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = None
        st.session_state.image_results = []
        st.session_state.form_submitted = False
        st.session_state.pdf_data_extracted = False
        st.session_state.pdf_extracted_df = pd.DataFrame()
        return

    # Identifier used in download filenames (spaces -> underscores).
    paper_id = os.path.splitext(uploaded_file.name)[0].replace(" ", "_")

    # A different PDF than last rerun invalidates the cached extraction.
    if st.session_state.current_pdf_name != uploaded_file.name:
        st.session_state.pdf_processed = False
        st.session_state.current_pdf_name = uploaded_file.name
        st.session_state.image_results = []
        st.session_state.form_submitted = False

    if st.session_state.form_submitted:
        # Rerun caused by a form submission: avoid re-processing the PDF.
        st.session_state.form_submitted = False
        st.info("A Form was submitted. But your previous extracted data has been added already. If you want to extract more data/plots "
                "upload again")
        tab1, tab2 = st.tabs(["Material Data", "Extracted Plots"])
        with tab1:
            st.info("Material data from form has been added to database.")
        with tab2:
            st.info("Plots already extracted")
        return

    tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])

    # Write the upload to a temp path: the extractors need a real file.
    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, uploaded_file.name)
        with open(pdf_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with tab1:
            st.subheader("Material Properties Data")

            # Only call Gemini once per PDF; cached in session state after.
            if not st.session_state.pdf_data_extracted:
                with st.spinner(" Extracting material data..."):
                    with open(pdf_path, "rb") as f:
                        pdf_bytes = f.read()

                    data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)

                    if data:
                        df = convert_to_dataframe(data)
                        if not df.empty:
                            st.session_state.pdf_extracted_df = df
                            st.session_state.pdf_data_extracted = True
                            st.session_state.pdf_extracted_meta = data  # keep raw meta
                        else:
                            st.warning("No data extracted")
                    else:
                        st.error("Failed to extract data from PDF")
            # After extraction, or when rerunning, use stored data
            df = st.session_state.pdf_extracted_df

            if not df.empty:
                data = st.session_state.get("pdf_extracted_meta", {})
                st.success(f" Extracted {len(df)} properties")

                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Material", data.get("material_name", "N/A"))
                with col2:
                    st.metric("Abbreviation", data.get("material_abbreviation", "N/A"))

                st.dataframe(df, use_container_width=True, height=400)
                st.subheader("Assign Material Category")

                extracted_material_class = st.selectbox(
                    "Select category for this material",
                    ["Polymer", "Fiber", "Composite"],
                    index=None,
                    placeholder="Required before adding to database"
                )
                if st.button(" Add to Database"):
                    if not extracted_material_class:
                        st.error("Please select a material category before adding.")
                    else:
                        df["material_class"] = extracted_material_class
                        # material_type mirrors material_class for Page 1 filtering
                        df["material_type"] = extracted_material_class

                        if "user_uploaded_data" not in st.session_state:
                            st.session_state["user_uploaded_data"] = df
                        else:
                            st.session_state["user_uploaded_data"] = pd.concat(
                                [st.session_state["user_uploaded_data"], df],
                                ignore_index=True
                            )

                        st.success(f"Added to {extracted_material_class} database!")

                csv = df.to_csv(index=False)
                st.download_button(
                    "⬇ Download CSV",
                    data=csv,
                    file_name=f"{paper_id}_data.csv",
                    mime="text/csv"
                )

        with tab2:
            st.subheader("Extracted Plot Images")

            # Image extraction also runs only once per PDF.
            if not st.session_state.pdf_processed:
                with st.spinner(" Extracting plots from PDF..."):
                    doc = fitz.open(pdf_path)
                    st.session_state.image_results = extract_images(doc)
                    doc.close()
                    st.session_state.pdf_processed = True

            if st.session_state.image_results:
                subtab1, subtab2 = st.tabs([" Images", " JSON Preview"])

                with subtab1:
                    st.success(f" Extracted {len(st.session_state.image_results)} plots")

                    col_img, col_json, col_all = st.columns(3)

                    with col_img:
                        img_zip = create_zip(st.session_state.image_results, include_json=False)
                        st.download_button(
                            " Download Images Only",
                            data=img_zip,
                            file_name=f"{paper_id}_images.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_images"
                        )

                    with col_json:
                        json_data = [{"caption": r["caption"], "page": r["page"],
                                      "image_count": len(r["image_data"])} for r in st.session_state.image_results]
                        st.download_button(
                            " Download JSON",
                            data=json.dumps(json_data, indent=4),
                            file_name=f"{paper_id}_metadata.json",
                            mime="application/json",
                            use_container_width=True,
                            key="download_json_top"
                        )

                    with col_all:
                        full_zip = create_zip(st.session_state.image_results, include_json=True)
                        st.download_button(
                            " Download All",
                            data=full_zip,
                            file_name=f"{paper_id}_complete.zip",
                            mime="application/zip",
                            use_container_width=True,
                            key="download_all"
                        )

                    st.divider()

                    # Iterate over a snapshot: Delete buttons mutate the live
                    # list, so bounds are re-checked each iteration.
                    results_copy = st.session_state.image_results.copy()

                    for idx in range(len(results_copy)):
                        if idx >= len(st.session_state.image_results):
                            break

                        r = st.session_state.image_results[idx]

                        with st.container(border=True):
                            col_cap, col_btn = st.columns([0.85, 0.15])
                            col_cap.markdown(f"**Page {r['page']}** {r['caption']}")

                            if col_btn.button(" Delete", key=f"del_g_{idx}_{r['page']}"):
                                del st.session_state.image_results[idx]
                                st.rerun()

                            image_data_list = r['image_data']
                            if image_data_list and len(image_data_list) > 0:
                                cols = st.columns(len(image_data_list))
                                for p_idx in range(len(image_data_list)):
                                    if p_idx >= len(st.session_state.image_results[idx]['image_data']):
                                        break

                                    img_data = st.session_state.image_results[idx]['image_data'][p_idx]
                                    with cols[p_idx]:
                                        # BUG FIX: was width=img_width (undefined name -> NameError)
                                        st.image(img_data['array'], use_container_width=True, channels="BGR")
                                        if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
                                            del st.session_state.image_results[idx]['image_data'][p_idx]
                                            # Drop the whole figure entry when its last panel is removed
                                            if len(st.session_state.image_results[idx]['image_data']) == 0:
                                                del st.session_state.image_results[idx]
                                            st.rerun()

                with subtab2:
                    st.subheader("Metadata Preview")
                    json_data = [{"caption": r["caption"], "page": r["page"],
                                  "image_count": len(r["image_data"]),
                                  "images": [img["filename"] for img in r["image_data"]]}
                                 for r in st.session_state.image_results]

                    st.download_button(
                        " Download JSON",
                        data=json.dumps(json_data, indent=4),
                        file_name=f"{paper_id}_metadata.json",
                        mime="application/json",
                        key="download_json_bottom"
                    )

                    st.json(json_data)
            else:
                st.warning("No plots found in PDF")
669
-
670
- if __name__ == "__main__":
671
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/pages/categorized/propgraph.jpg DELETED
Binary file (83.4 kB)