MatDatabase_Dev

Running

File size: 9,897 Bytes

1adc2e7

import streamlit as st
import pandas as pd
from PIL import Image
import requests
import base64
import json
import os
from typing import Dict, Any, Optional




# Backend PDF extraction Logic
API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"

SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "material_name": {"type": "STRING"},
        "material_abbreviation": {"type": "STRING"},
        "mechanical_properties": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "section": {"type": "STRING"},
                    "property_name": {"type": "STRING"},
                    "value": {"type": "STRING"},
                    "unit": {"type": "STRING"},
                    "english": {"type": "STRING"},
                    "test_condition": {"type": "STRING"},
                    "comments": {"type": "STRING"}
                },
                "required": ["section", "property_name", "value", "english", "comments"]
            }
        }
    }
}

# === GEMINI CALL FUNCTION ===
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
    """Calls Gemini API with PDF bytes"""
    try:
        encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
        mime_type = "application/pdf"
    except Exception as e:
        st.error(f"Error encoding PDF: {e}")
        return None

    prompt = (
         "Extract all experimental data from this research paper. "
         "For each measurement, extract: "
         "- experiment_name, measured_value, unit, uncertainty, method, conditions. "
         "Return as JSON."
        # "You are an expert materials scientist. From the attached PDF, extract the material name, "
        # "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
        # "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
        # "For each property, you MUST extract:\n"
        # "- property_name\n- value (or range)\n- unit\n"
        # "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
        # "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
        # "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
    )

    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"inlineData": {"mimeType": mime_type, "data": encoded_file}}
                ]
            }
        ],
        "generationConfig": {
            "temperature": 0,
            "responseMimeType": "application/json",
            "responseSchema": SCHEMA
        }
    }

    try:
        r = requests.post(API_URL, json=payload, timeout=300)
        r.raise_for_status()
        data = r.json()
        
        candidates = data.get("candidates", [])
        if not candidates:
            return None

        parts = candidates[0].get("content", {}).get("parts", [])
        json_text = None
        for p in parts:
            t = p.get("text", "")
            if t.strip().startswith("{"):
                json_text = t
                break

        return json.loads(json_text) if json_text else None
    except Exception as e:
        st.error(f"Gemini API Error: {e}")
        return None


def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
    """Convert extracted JSON to DataFrame"""
    rows = []
    for item in data.get("mechanical_properties", []):
        rows.append({
            "material_name": data.get("material_name", ""),
            "material_abbreviation": data.get("material_abbreviation", ""),
            "section": item.get("section", ""),
            "property_name": item.get("property_name", ""),
            "value": item.get("value", ""),
            "unit": item.get("unit", ""),
            "english": item.get("english", ""),
            "test_condition": item.get("test_condition", ""),
            "comments": item.get("comments", "")
        })
    return pd.DataFrame(rows)



#using sentence transformers and semantic search techniques
import sqlite3
import pandas as pd
import os
import requests
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ==========================
# CONFIGURATION
# ==========================
DB_PATH = "output_materials.db"
EXCEL_PATH = "5.1__actual.xlsx"
OUTPUT_EXCEL = "5.1__filled.xlsx"
GEMINI_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")

GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"


# ==========================
# GEMINI YES/NO MATCH CHECK
# ==========================
def gemini_same_property(excel_prop, db_prop):
    prompt = f"""

You are an expert materials scientist. Determine if BOTH property names refer

to the SAME mechanical property.



Excel property: "{excel_prop}"

Database property: "{db_prop}"



Rules:

- Compare meaning, not formatting.

- Ignore units, values, and numbers.

- If either refers to conditions, test setup, or non-property info, return NO.

- Return ONLY YES or NO.

"""

    payload = {
        "contents": [{"parts": [{"text": prompt}]}]
    }

    response = requests.post(
        GEMINI_URL,
        params={"key": GEMINI_KEY},
        json=payload,
        timeout=60
    ).json()

    try:
        ans = response["candidates"][0]["content"]["parts"][0]["text"].strip().upper()
    except:
        return False

    return ans == "YES"


# ==========================
# SEMANTIC MATCHER (fallback)
# ==========================
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def semantic_match(excel_prop, df_section):
    if df_section.empty:
        return None

    # compute embeddings
    db_props = df_section["property_name"].tolist()
    db_vecs = embed_model.encode(db_props, convert_to_numpy=True)
    q_vec = embed_model.encode([excel_prop], convert_to_numpy=True)

    sims = cosine_similarity(q_vec, db_vecs)[0]

    df_section = df_section.copy()
    df_section["sim"] = sims
    df_section = df_section.sort_values("sim", ascending=False)

    # Take top-5 candidates for Gemini check
    top5 = df_section.head(5)

    for _, row in top5.iterrows():
        cand = row["property_name"]
        if gemini_same_property(excel_prop, cand):
            return row

    return None


# ==========================
# MAIN PIPELINE
# ==========================
conn = sqlite3.connect(DB_PATH)

# Get material tables
tables = pd.read_sql_query(
    "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';",
    conn
)["name"].tolist()

print(f"Detected tables: {tables}")

# Load Excel template once
df_excel_template = pd.read_excel(EXCEL_PATH)
cols = df_excel_template.columns.tolist()

section_col = next((c for c in cols if "section" in c.lower()), None)
prop_col = next((c for c in cols if "property" in c.lower()), cols[0])

print(f"Detected section column: {section_col}")
print(f"Detected property column: {prop_col}")

with pd.ExcelWriter(OUTPUT_EXCEL, engine="openpyxl") as writer:

    for table in tables:
        print(f"\nProcessing table: {table}")

        # Load DB table
        df_db = pd.read_sql_query(f"""

            SELECT section, property_name, value, unit, english, comments

            FROM '{table}'

        """, conn)

        df_excel = df_excel_template.copy()
        df_excel["Matched Property"] = ""
        df_excel["Value"] = ""
        df_excel["Unit"] = ""
        df_excel["English"] = ""
        df_excel["Comments"] = ""

        # Process each Excel property
        for i, row in df_excel.iterrows():
            excel_prop = str(row[prop_col]).strip()
            excel_section = str(row.get(section_col, "")).strip().lower()


            if section_col:
                df_sec = df_db[df_db["section"].str.lower() == excel_section]
            else:
                df_sec = df_db

            # ==========================
            # 1️ EXACT MATCH
            # ==========================
            exact = df_sec[df_sec["property_name"].str.lower() == excel_prop.lower()]

            if not exact.empty:
                r = exact.iloc[0]
                df_excel.at[i, "Matched Property"] = r["property_name"]
                df_excel.at[i, "Value"] = r["value"]
                df_excel.at[i, "Unit"] = r["unit"]
                df_excel.at[i, "English"] = r["english"]
                df_excel.at[i, "Comments"] = r["comments"]
                continue  # done

            # ==========================
            # 2️ SEMANTIC + GEMINI MATCH
            # ==========================
            best = semantic_match(excel_prop, df_sec)

            if best is not None:
                df_excel.at[i, "Matched Property"] = best["property_name"]
                df_excel.at[i, "Value"] = best["value"]
                df_excel.at[i, "Unit"] = best["unit"]
                df_excel.at[i, "English"] = best["english"]
                df_excel.at[i, "Comments"] = best["comments"]
            else:
                df_excel.at[i, "Matched Property"] = ""

        # Write one sheet per material
        df_excel.to_excel(writer, sheet_name=table[:31], index=False)

print(f"\nDONE → Final filled Excel: {OUTPUT_EXCEL}")
conn.close()