Spaces:

aim4composites
/

MaterialsDatabase

Running

App Files Files Community

MaterialsDatabase / page_files /categorized /Backend /Pdf_DataExtraction.py

AbhijitClemson

Upload folder using huggingface_hub

1adc2e7 verified about 1 month ago

raw

history blame

9.9 kB

	import streamlit as st
	import pandas as pd
	from PIL import Image
	import requests
	import base64
	import json
	import os
	from typing import Dict, Any, Optional




	# Backend PDF extraction Logic
	API_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
	API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"

	SCHEMA = {
	"type": "OBJECT",
	"properties": {
	"material_name": {"type": "STRING"},
	"material_abbreviation": {"type": "STRING"},
	"mechanical_properties": {
	"type": "ARRAY",
	"items": {
	"type": "OBJECT",
	"properties": {
	"section": {"type": "STRING"},
	"property_name": {"type": "STRING"},
	"value": {"type": "STRING"},
	"unit": {"type": "STRING"},
	"english": {"type": "STRING"},
	"test_condition": {"type": "STRING"},
	"comments": {"type": "STRING"}
	},
	"required": ["section", "property_name", "value", "english", "comments"]
	}
	}
	}
	}

	# === GEMINI CALL FUNCTION ===
	def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
	"""Calls Gemini API with PDF bytes"""
	try:
	encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
	mime_type = "application/pdf"
	except Exception as e:
	st.error(f"Error encoding PDF: {e}")
	return None

	prompt = (
	"Extract all experimental data from this research paper. "
	"For each measurement, extract: "
	"- experiment_name, measured_value, unit, uncertainty, method, conditions. "
	"Return as JSON."
	# "You are an expert materials scientist. From the attached PDF, extract the material name, "
	# "abbreviation, and ALL properties across categories (Mechanical, Thermal, Electrical, Physical, "
	# "Optical, Rheological, etc.). Return them as 'mechanical_properties' (a single list). "
	# "For each property, you MUST extract:\n"
	# "- property_name\n- value (or range)\n- unit\n"
	# "- english (converted or alternate units, e.g., psi, °F, inches; write '' if not provided)\n"
	# "- test_condition\n- comments (include any notes, footnotes, standards, remarks; write '' if none)\n"
	# "All fields including english and comments are REQUIRED. Respond ONLY with valid JSON following the schema."
	)

	payload = {
	"contents": [
	{
	"parts": [
	{"text": prompt},
	{"inlineData": {"mimeType": mime_type, "data": encoded_file}}
	]
	}
	],
	"generationConfig": {
	"temperature": 0,
	"responseMimeType": "application/json",
	"responseSchema": SCHEMA
	}
	}

	try:
	r = requests.post(API_URL, json=payload, timeout=300)
	r.raise_for_status()
	data = r.json()

	candidates = data.get("candidates", [])
	if not candidates:
	return None

	parts = candidates[0].get("content", {}).get("parts", [])
	json_text = None
	for p in parts:
	t = p.get("text", "")
	if t.strip().startswith("{"):
	json_text = t
	break

	return json.loads(json_text) if json_text else None
	except Exception as e:
	st.error(f"Gemini API Error: {e}")
	return None


	def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
	"""Convert extracted JSON to DataFrame"""
	rows = []
	for item in data.get("mechanical_properties", []):
	rows.append({
	"material_name": data.get("material_name", ""),
	"material_abbreviation": data.get("material_abbreviation", ""),
	"section": item.get("section", ""),
	"property_name": item.get("property_name", ""),
	"value": item.get("value", ""),
	"unit": item.get("unit", ""),
	"english": item.get("english", ""),
	"test_condition": item.get("test_condition", ""),
	"comments": item.get("comments", "")
	})
	return pd.DataFrame(rows)



	#using sentence transformers and semantic search techniques
	import sqlite3
	import pandas as pd
	import os
	import requests
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity

	# ==========================
	# CONFIGURATION
	# ==========================
	DB_PATH = "output_materials.db"
	EXCEL_PATH = "5.1__actual.xlsx"
	OUTPUT_EXCEL = "5.1__filled.xlsx"
	GEMINI_KEY = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")

	GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"


	# ==========================
	# GEMINI YES/NO MATCH CHECK
	# ==========================
	def gemini_same_property(excel_prop, db_prop):
	prompt = f"""
	You are an expert materials scientist. Determine if BOTH property names refer
	to the SAME mechanical property.

	Excel property: "{excel_prop}"
	Database property: "{db_prop}"

	Rules:
	- Compare meaning, not formatting.
	- Ignore units, values, and numbers.
	- If either refers to conditions, test setup, or non-property info, return NO.
	- Return ONLY YES or NO.
	"""

	payload = {
	"contents": [{"parts": [{"text": prompt}]}]
	}

	response = requests.post(
	GEMINI_URL,
	params={"key": GEMINI_KEY},
	json=payload,
	timeout=60
	).json()

	try:
	ans = response["candidates"][0]["content"]["parts"][0]["text"].strip().upper()
	except:
	return False

	return ans == "YES"


	# ==========================
	# SEMANTIC MATCHER (fallback)
	# ==========================
	embed_model = SentenceTransformer("all-MiniLM-L6-v2")

	def semantic_match(excel_prop, df_section):
	if df_section.empty:
	return None

	# compute embeddings
	db_props = df_section["property_name"].tolist()
	db_vecs = embed_model.encode(db_props, convert_to_numpy=True)
	q_vec = embed_model.encode([excel_prop], convert_to_numpy=True)

	sims = cosine_similarity(q_vec, db_vecs)[0]

	df_section = df_section.copy()
	df_section["sim"] = sims
	df_section = df_section.sort_values("sim", ascending=False)

	# Take top-5 candidates for Gemini check
	top5 = df_section.head(5)

	for _, row in top5.iterrows():
	cand = row["property_name"]
	if gemini_same_property(excel_prop, cand):
	return row

	return None


	# ==========================
	# MAIN PIPELINE
	# ==========================
	conn = sqlite3.connect(DB_PATH)

	# Get material tables
	tables = pd.read_sql_query(
	"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';",
	conn
	)["name"].tolist()

	print(f"Detected tables: {tables}")

	# Load Excel template once
	df_excel_template = pd.read_excel(EXCEL_PATH)
	cols = df_excel_template.columns.tolist()

	section_col = next((c for c in cols if "section" in c.lower()), None)
	prop_col = next((c for c in cols if "property" in c.lower()), cols[0])

	print(f"Detected section column: {section_col}")
	print(f"Detected property column: {prop_col}")

	with pd.ExcelWriter(OUTPUT_EXCEL, engine="openpyxl") as writer:

	for table in tables:
	print(f"\nProcessing table: {table}")

	# Load DB table
	df_db = pd.read_sql_query(f"""
	SELECT section, property_name, value, unit, english, comments
	FROM '{table}'
	""", conn)

	df_excel = df_excel_template.copy()
	df_excel["Matched Property"] = ""
	df_excel["Value"] = ""
	df_excel["Unit"] = ""
	df_excel["English"] = ""
	df_excel["Comments"] = ""

	# Process each Excel property
	for i, row in df_excel.iterrows():
	excel_prop = str(row[prop_col]).strip()
	excel_section = str(row.get(section_col, "")).strip().lower()


	if section_col:
	df_sec = df_db[df_db["section"].str.lower() == excel_section]
	else:
	df_sec = df_db

	# ==========================
	# 1️ EXACT MATCH
	# ==========================
	exact = df_sec[df_sec["property_name"].str.lower() == excel_prop.lower()]

	if not exact.empty:
	r = exact.iloc[0]
	df_excel.at[i, "Matched Property"] = r["property_name"]
	df_excel.at[i, "Value"] = r["value"]
	df_excel.at[i, "Unit"] = r["unit"]
	df_excel.at[i, "English"] = r["english"]
	df_excel.at[i, "Comments"] = r["comments"]
	continue # done

	# ==========================
	# 2️ SEMANTIC + GEMINI MATCH
	# ==========================
	best = semantic_match(excel_prop, df_sec)

	if best is not None:
	df_excel.at[i, "Matched Property"] = best["property_name"]
	df_excel.at[i, "Value"] = best["value"]
	df_excel.at[i, "Unit"] = best["unit"]
	df_excel.at[i, "English"] = best["english"]
	df_excel.at[i, "Comments"] = best["comments"]
	else:
	df_excel.at[i, "Matched Property"] = ""

	# Write one sheet per material
	df_excel.to_excel(writer, sheet_name=table[:31], index=False)

	print(f"\nDONE → Final filled Excel: {OUTPUT_EXCEL}")
	conn.close()