Spaces:
Sleeping
Sleeping
Update src/pages/categorized/page6.py
Browse files - src/pages/categorized/page6.py +21 -13
src/pages/categorized/page6.py
CHANGED
|
@@ -14,13 +14,15 @@ import requests
|
|
| 14 |
import base64
|
| 15 |
from typing import Dict, Any, Optional
|
| 16 |
from collections import defaultdict
|
|
|
|
| 17 |
|
|
|
|
| 18 |
API_KEY = os.environ.get("GEMINI_API_KEY")
|
| 19 |
if not API_KEY:
|
| 20 |
st.error("Gemini API key not found. Set GEMINI_API_KEY in Hugging Face Secrets.")
|
| 21 |
st.stop()
|
| 22 |
|
| 23 |
-
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-
|
| 24 |
|
| 25 |
SCHEMA = {
|
| 26 |
"type": "OBJECT",
|
|
@@ -60,6 +62,10 @@ CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
|
|
| 60 |
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
|
| 61 |
"""Calls Gemini API with PDF bytes"""
|
| 62 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
|
| 64 |
mime_type = "application/pdf"
|
| 65 |
except Exception as e:
|
|
@@ -140,7 +146,6 @@ def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
|
|
| 140 |
})
|
| 141 |
return pd.DataFrame(rows)
|
| 142 |
|
| 143 |
-
# --- IMAGE EXTRACTION LOGIC ---
|
| 144 |
def get_page_image(page):
|
| 145 |
pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
|
| 146 |
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
|
|
@@ -210,7 +215,6 @@ def extract_images(pdf_doc):
|
|
| 210 |
x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
|
| 211 |
crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]
|
| 212 |
|
| 213 |
-
# Store image data in memory instead of saving to disk
|
| 214 |
_, buffer = cv2.imencode('.png', crop)
|
| 215 |
img_bytes = buffer.tobytes()
|
| 216 |
|
|
@@ -461,7 +465,10 @@ def main():
|
|
| 461 |
st.title("PDF Material Data & Plot Extractor")
|
| 462 |
|
| 463 |
uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
| 465 |
if not uploaded_file:
|
| 466 |
|
| 467 |
st.info("Upload a PDF to extract material data and plots")
|
|
@@ -495,22 +502,24 @@ def main():
|
|
| 495 |
|
| 496 |
tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
|
| 497 |
|
| 498 |
-
with tempfile.TemporaryDirectory() as tmpdir:
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
|
|
|
|
|
|
| 503 |
with tab1:
|
| 504 |
st.subheader("Material Properties Data")
|
| 505 |
|
| 506 |
-
# Only call Gemini once per PDF
|
| 507 |
if not st.session_state.pdf_data_extracted:
|
| 508 |
-
with st.spinner("
|
| 509 |
with open(pdf_path, "rb") as f:
|
| 510 |
pdf_bytes = f.read()
|
| 511 |
|
| 512 |
data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
|
| 513 |
|
|
|
|
| 514 |
if data:
|
| 515 |
df = convert_to_dataframe(data)
|
| 516 |
if not df.empty:
|
|
@@ -521,7 +530,6 @@ def main():
|
|
| 521 |
st.warning("No data extracted")
|
| 522 |
else:
|
| 523 |
st.error("Failed to extract data from PDF")
|
| 524 |
-
# After extraction, or when rerunning, use stored data
|
| 525 |
df = st.session_state.pdf_extracted_df
|
| 526 |
|
| 527 |
if not df.empty:
|
|
@@ -649,7 +657,7 @@ def main():
|
|
| 649 |
|
| 650 |
img_data = st.session_state.image_results[idx]['image_data'][p_idx]
|
| 651 |
with cols[p_idx]:
|
| 652 |
-
st.image(img_data['array'],
|
| 653 |
if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
|
| 654 |
del st.session_state.image_results[idx]['image_data'][p_idx]
|
| 655 |
if len(st.session_state.image_results[idx]['image_data']) == 0:
|
|
|
|
| 14 |
import base64
|
| 15 |
from typing import Dict, Any, Optional
|
| 16 |
from collections import defaultdict
|
| 17 |
+
import google.generativeai as genai
|
| 18 |
|
| 19 |
+
# --- Gemini configuration (module level) ---
# Read the key with .get() so a missing secret produces the friendly
# Streamlit error below instead of a KeyError raised at import time.
API_KEY = os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    st.error("Gemini API key not found. Set GEMINI_API_KEY in Hugging Face Secrets.")
    st.stop()

# Configure the SDK only after the key is known to be present; reuse the
# value already read rather than indexing os.environ a second time.
genai.configure(api_key=API_KEY)

# NOTE(review): the key is embedded in the URL query string, so it can end up
# in request logs or exception messages that include the URL. Consider sending
# it in a request header at the call site instead -- confirm against the
# Gemini REST docs before changing.
# NOTE(review): confirm that this model id is a currently available model.
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-preview-09-2025:generateContent?key={API_KEY}"
|
| 26 |
|
| 27 |
SCHEMA = {
|
| 28 |
"type": "OBJECT",
|
|
|
|
| 62 |
def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
|
| 63 |
"""Calls Gemini API with PDF bytes"""
|
| 64 |
try:
|
| 65 |
+
if len(pdf_bytes) > 3 * 1024 * 1024:
|
| 66 |
+
st.error("PDF too large for Gemini demo on Hugging Face (max ~3MB).")
|
| 67 |
+
return None
|
| 68 |
+
|
| 69 |
encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
|
| 70 |
mime_type = "application/pdf"
|
| 71 |
except Exception as e:
|
|
|
|
| 146 |
})
|
| 147 |
return pd.DataFrame(rows)
|
| 148 |
|
|
|
|
| 149 |
def get_page_image(page):
|
| 150 |
pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
|
| 151 |
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
|
|
|
|
| 215 |
x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
|
| 216 |
crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]
|
| 217 |
|
|
|
|
| 218 |
_, buffer = cv2.imencode('.png', crop)
|
| 219 |
img_bytes = buffer.tobytes()
|
| 220 |
|
|
|
|
| 465 |
st.title("PDF Material Data & Plot Extractor")
|
| 466 |
|
| 467 |
uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
|
| 468 |
+
if uploaded_file is not None:
|
| 469 |
+
if uploaded_file.size > 10 * 1024 * 1024:
|
| 470 |
+
st.error("PDF too large (max 10MB for demo)")
|
| 471 |
+
st.stop()
|
| 472 |
if not uploaded_file:
|
| 473 |
|
| 474 |
st.info("Upload a PDF to extract material data and plots")
|
|
|
|
| 502 |
|
| 503 |
tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
|
| 504 |
|
| 505 |
+
#with tempfile.TemporaryDirectory() as tmpdir:
|
| 506 |
+
# pdf_path = os.path.join(tmpdir, uploaded_file.name)
|
| 507 |
+
# with open(pdf_path, "wb") as f:
|
| 508 |
+
# f.write(uploaded_file.getbuffer())
|
| 509 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
| 510 |
+
tmp.write(uploaded_file.read())
|
| 511 |
+
pdf_path = tmp.name
|
| 512 |
with tab1:
|
| 513 |
st.subheader("Material Properties Data")
|
| 514 |
|
|
|
|
| 515 |
if not st.session_state.pdf_data_extracted:
|
| 516 |
+
with st.spinner("Extracting material data from PDF…"):
|
| 517 |
with open(pdf_path, "rb") as f:
|
| 518 |
pdf_bytes = f.read()
|
| 519 |
|
| 520 |
data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
|
| 521 |
|
| 522 |
+
|
| 523 |
if data:
|
| 524 |
df = convert_to_dataframe(data)
|
| 525 |
if not df.empty:
|
|
|
|
| 530 |
st.warning("No data extracted")
|
| 531 |
else:
|
| 532 |
st.error("Failed to extract data from PDF")
|
|
|
|
| 533 |
df = st.session_state.pdf_extracted_df
|
| 534 |
|
| 535 |
if not df.empty:
|
|
|
|
| 657 |
|
| 658 |
img_data = st.session_state.image_results[idx]['image_data'][p_idx]
|
| 659 |
with cols[p_idx]:
|
| 660 |
+
st.image(img_data['array'], use_container_width=True, channels="BGR")
|
| 661 |
if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
|
| 662 |
del st.session_state.image_results[idx]['image_data'][p_idx]
|
| 663 |
if len(st.session_state.image_results[idx]['image_data']) == 0:
|