Spaces:
Running
Running
| import streamlit as st | |
| import pandas as pd | |
| from PIL import Image | |
| import requests | |
| import base64 | |
| import json | |
| import os | |
| from typing import Dict, Any, Optional | |
| # Backend PDF extraction Logic | |
| API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") | |
| API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}" | |
| SCHEMA = { | |
| "type": "OBJECT", | |
| "properties": { | |
| "material_name": {"type": "STRING"}, | |
| "material_abbreviation": {"type": "STRING"}, | |
| "mechanical_properties": { | |
| "type": "ARRAY", | |
| "items": { | |
| "type": "OBJECT", | |
| "properties": { | |
| "section": {"type": "STRING"}, | |
| "property_name": {"type": "STRING"}, | |
| "value": {"type": "STRING"}, | |
| "unit": {"type": "STRING"}, | |
| "english": {"type": "STRING"}, | |
| "test_condition": {"type": "STRING"}, | |
| "comments": {"type": "STRING"} | |
| }, | |
| "required": ["section", "property_name", "value", "english", "comments"] | |
| } | |
| } | |
| } | |
| } | |
| # === GEMINI CALL FUNCTION === | |
| def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]: | |
| """Calls Gemini API with PDF bytes""" | |
| try: | |
| encoded_file = base64.b64encode(pdf_bytes).decode("utf-8") | |
| mime_type = "application/pdf" | |
| except Exception as e: | |
| st.error(f"Error encoding PDF: {e}") | |
| return None | |
| prompt = ( | |
| "Extract all experimental data from this research paper. " | |
| "For each measurement, extract: " | |
| "- experiment_name, measured_value, unit, uncertainty, method, conditions. " | |
| "Return as JSON." | |
| # "You are an expert materials scientist. From the attached PDF, extract the material name, " | |
| # "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, " | |
| # "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). " | |
| # "For each property, you MUST extract:\n" | |
| # "- property_name\n- value (or range)\n- unit\n" | |
| # "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n" | |
| # "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n" | |
| # "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema." | |
| ) | |
| payload = { | |
| "contents": [ | |
| { | |
| "parts": [ | |
| {"text": prompt}, | |
| {"inlineData": {"mimeType": mime_type, "data": encoded_file}} | |
| ] | |
| } | |
| ], | |
| "generationConfig": { | |
| "temperature": 0, | |
| "responseMimeType": "application/json", | |
| "responseSchema": SCHEMA | |
| } | |
| } | |
| try: | |
| r = requests.post(API_URL, json=payload, timeout=300) | |
| r.raise_for_status() | |
| data = r.json() | |
| candidates = data.get("candidates", []) | |
| if not candidates: | |
| return None | |
| parts = candidates[0].get("content", {}).get("parts", []) | |
| json_text = None | |
| for p in parts: | |
| t = p.get("text", "") | |
| if t.strip().startswith("{"): | |
| json_text = t | |
| break | |
| return json.loads(json_text) if json_text else None | |
| except Exception as e: | |
| st.error(f"Gemini API Error: {e}") | |
| return None | |
| def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame: | |
| """Convert extracted JSON to DataFrame""" | |
| rows = [] | |
| for item in data.get("mechanical_properties", []): | |
| rows.append({ | |
| "material_name": data.get("material_name", ""), | |
| "material_abbreviation": data.get("material_abbreviation", ""), | |
| "section": item.get("section", ""), | |
| "property_name": item.get("property_name", ""), | |
| "value": item.get("value", ""), | |
| "unit": item.get("unit", ""), | |
| "english": item.get("english", ""), | |
| "test_condition": item.get("test_condition", ""), | |
| "comments": item.get("comments", "") | |
| }) | |
| return pd.DataFrame(rows) | |
| #using sentence transformers and semantic search techniques | |
| import sqlite3 | |
| import pandas as pd | |
| import os | |
| import requests | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # ========================== | |
| # CONFIGURATION | |
| # ========================== | |
| DB_PATH = "output_materials.db" | |
| EXCEL_PATH = "5.1__actual.xlsx" | |
| OUTPUT_EXCEL = "5.1__filled.xlsx" | |
| GEMINI_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") | |
| GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" | |
| # ========================== | |
| # GEMINI YES/NO MATCH CHECK | |
| # ========================== | |
| def gemini_same_property(excel_prop, db_prop): | |
| prompt = f""" | |
| You are an expert materials scientist. Determine if BOTH property names refer | |
| to the SAME mechanical property. | |
| Excel property: "{excel_prop}" | |
| Database property: "{db_prop}" | |
| Rules: | |
| - Compare meaning, not formatting. | |
| - Ignore units, values, and numbers. | |
| - If either refers to conditions, test setup, or non-property info, return NO. | |
| - Return ONLY YES or NO. | |
| """ | |
| payload = { | |
| "contents": [{"parts": [{"text": prompt}]}] | |
| } | |
| response = requests.post( | |
| GEMINI_URL, | |
| params={"key": GEMINI_KEY}, | |
| json=payload, | |
| timeout=60 | |
| ).json() | |
| try: | |
| ans = response["candidates"][0]["content"]["parts"][0]["text"].strip().upper() | |
| except: | |
| return False | |
| return ans == "YES" | |
| # ========================== | |
| # SEMANTIC MATCHER (fallback) | |
| # ========================== | |
| embed_model = SentenceTransformer("all-MiniLM-L6-v2") | |
| def semantic_match(excel_prop, df_section): | |
| if df_section.empty: | |
| return None | |
| # compute embeddings | |
| db_props = df_section["property_name"].tolist() | |
| db_vecs = embed_model.encode(db_props, convert_to_numpy=True) | |
| q_vec = embed_model.encode([excel_prop], convert_to_numpy=True) | |
| sims = cosine_similarity(q_vec, db_vecs)[0] | |
| df_section = df_section.copy() | |
| df_section["sim"] = sims | |
| df_section = df_section.sort_values("sim", ascending=False) | |
| # Take top-5 candidates for Gemini check | |
| top5 = df_section.head(5) | |
| for _, row in top5.iterrows(): | |
| cand = row["property_name"] | |
| if gemini_same_property(excel_prop, cand): | |
| return row | |
| return None | |
| # ========================== | |
| # MAIN PIPELINE | |
| # ========================== | |
| conn = sqlite3.connect(DB_PATH) | |
| # Get material tables | |
| tables = pd.read_sql_query( | |
| "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';", | |
| conn | |
| )["name"].tolist() | |
| print(f"Detected tables: {tables}") | |
| # Load Excel template once | |
| df_excel_template = pd.read_excel(EXCEL_PATH) | |
| cols = df_excel_template.columns.tolist() | |
| section_col = next((c for c in cols if "section" in c.lower()), None) | |
| prop_col = next((c for c in cols if "property" in c.lower()), cols[0]) | |
| print(f"Detected section column: {section_col}") | |
| print(f"Detected property column: {prop_col}") | |
| with pd.ExcelWriter(OUTPUT_EXCEL, engine="openpyxl") as writer: | |
| for table in tables: | |
| print(f"\nProcessing table: {table}") | |
| # Load DB table | |
| df_db = pd.read_sql_query(f""" | |
| SELECT section, property_name, value, unit, english, comments | |
| FROM '{table}' | |
| """, conn) | |
| df_excel = df_excel_template.copy() | |
| df_excel["Matched Property"] = "" | |
| df_excel["Value"] = "" | |
| df_excel["Unit"] = "" | |
| df_excel["English"] = "" | |
| df_excel["Comments"] = "" | |
| # Process each Excel property | |
| for i, row in df_excel.iterrows(): | |
| excel_prop = str(row[prop_col]).strip() | |
| excel_section = str(row.get(section_col, "")).strip().lower() | |
| if section_col: | |
| df_sec = df_db[df_db["section"].str.lower() == excel_section] | |
| else: | |
| df_sec = df_db | |
| # ========================== | |
| # 1️ EXACT MATCH | |
| # ========================== | |
| exact = df_sec[df_sec["property_name"].str.lower() == excel_prop.lower()] | |
| if not exact.empty: | |
| r = exact.iloc[0] | |
| df_excel.at[i, "Matched Property"] = r["property_name"] | |
| df_excel.at[i, "Value"] = r["value"] | |
| df_excel.at[i, "Unit"] = r["unit"] | |
| df_excel.at[i, "English"] = r["english"] | |
| df_excel.at[i, "Comments"] = r["comments"] | |
| continue # done | |
| # ========================== | |
| # 2️ SEMANTIC + GEMINI MATCH | |
| # ========================== | |
| best = semantic_match(excel_prop, df_sec) | |
| if best is not None: | |
| df_excel.at[i, "Matched Property"] = best["property_name"] | |
| df_excel.at[i, "Value"] = best["value"] | |
| df_excel.at[i, "Unit"] = best["unit"] | |
| df_excel.at[i, "English"] = best["english"] | |
| df_excel.at[i, "Comments"] = best["comments"] | |
| else: | |
| df_excel.at[i, "Matched Property"] = "" | |
| # Write one sheet per material | |
| df_excel.to_excel(writer, sheet_name=table[:31], index=False) | |
| print(f"\nDONE → Final filled Excel: {OUTPUT_EXCEL}") | |
| conn.close() | |