AbhijitClemson committed on
Commit
52bab2b
·
verified ·
1 Parent(s): 2b4fb27

Update src/pages/categorized/page6.py

Browse files
Files changed (1) hide show
  1. src/pages/categorized/page6.py +21 -13
src/pages/categorized/page6.py CHANGED
@@ -14,13 +14,15 @@ import requests
14
  import base64
15
  from typing import Dict, Any, Optional
16
  from collections import defaultdict
 
17
 
 
18
  API_KEY = os.environ.get("GEMINI_API_KEY")
19
  if not API_KEY:
20
  st.error("Gemini API key not found. Set GEMINI_API_KEY in Hugging Face Secrets.")
21
  st.stop()
22
 
23
- API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent?key={API_KEY}"
24
 
25
  SCHEMA = {
26
  "type": "OBJECT",
@@ -60,6 +62,10 @@ CAP_RE = re.compile(r"^(Fig\.?\s*\d+|Figure\s*\d+)\b", re.IGNORECASE)
60
  def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
61
  """Calls Gemini API with PDF bytes"""
62
  try:
 
 
 
 
63
  encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
64
  mime_type = "application/pdf"
65
  except Exception as e:
@@ -140,7 +146,6 @@ def convert_to_dataframe(data: Dict[str, Any]) -> pd.DataFrame:
140
  })
141
  return pd.DataFrame(rows)
142
 
143
- # --- IMAGE EXTRACTION LOGIC ---
144
  def get_page_image(page):
145
  pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
146
  img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
@@ -210,7 +215,6 @@ def extract_images(pdf_doc):
210
  x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
211
  crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]
212
 
213
- # Store image data in memory instead of saving to disk
214
  _, buffer = cv2.imencode('.png', crop)
215
  img_bytes = buffer.tobytes()
216
 
@@ -461,7 +465,10 @@ def main():
461
  st.title("PDF Material Data & Plot Extractor")
462
 
463
  uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
464
-
 
 
 
465
  if not uploaded_file:
466
 
467
  st.info("Upload a PDF to extract material data and plots")
@@ -495,22 +502,24 @@ def main():
495
 
496
  tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
497
 
498
- with tempfile.TemporaryDirectory() as tmpdir:
499
- pdf_path = os.path.join(tmpdir, uploaded_file.name)
500
- with open(pdf_path, "wb") as f:
501
- f.write(uploaded_file.getbuffer())
502
-
 
 
503
  with tab1:
504
  st.subheader("Material Properties Data")
505
 
506
- # Only call Gemini once per PDF
507
  if not st.session_state.pdf_data_extracted:
508
- with st.spinner(" Extracting material data..."):
509
  with open(pdf_path, "rb") as f:
510
  pdf_bytes = f.read()
511
 
512
  data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
513
 
 
514
  if data:
515
  df = convert_to_dataframe(data)
516
  if not df.empty:
@@ -521,7 +530,6 @@ def main():
521
  st.warning("No data extracted")
522
  else:
523
  st.error("Failed to extract data from PDF")
524
- # After extraction, or when rerunning, use stored data
525
  df = st.session_state.pdf_extracted_df
526
 
527
  if not df.empty:
@@ -649,7 +657,7 @@ def main():
649
 
650
  img_data = st.session_state.image_results[idx]['image_data'][p_idx]
651
  with cols[p_idx]:
652
- st.image(img_data['array'], width=img_width, channels="BGR")
653
  if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
654
  del st.session_state.image_results[idx]['image_data'][p_idx]
655
  if len(st.session_state.image_results[idx]['image_data']) == 0:
 
14
  import base64
15
  from typing import Dict, Any, Optional
16
  from collections import defaultdict
17
+ import google.generativeai as genai
18
 
19
+ genai.configure(api_key=os.environ["GEMINI_API_KEY"])
20
  API_KEY = os.environ.get("GEMINI_API_KEY")
21
  if not API_KEY:
22
  st.error("Gemini API key not found. Set GEMINI_API_KEY in Hugging Face Secrets.")
23
  st.stop()
24
 
25
+ API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-preview-09-2025:generateContent?key={API_KEY}"
26
 
27
  SCHEMA = {
28
  "type": "OBJECT",
 
62
  def call_gemini_from_bytes(pdf_bytes: bytes, filename: str) -> Optional[Dict[str, Any]]:
63
  """Calls Gemini API with PDF bytes"""
64
  try:
65
+ if len(pdf_bytes) > 3 * 1024 * 1024:
66
+ st.error("PDF too large for Gemini demo on Hugging Face (max ~3MB).")
67
+ return None
68
+
69
  encoded_file = base64.b64encode(pdf_bytes).decode("utf-8")
70
  mime_type = "application/pdf"
71
  except Exception as e:
 
146
  })
147
  return pd.DataFrame(rows)
148
 
 
149
  def get_page_image(page):
150
  pix = page.get_pixmap(matrix=fitz.Matrix(DPI/72, DPI/72))
151
  img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, 3)
 
215
  x2, y2 = min(page_w, cx + cw + PADDING), min(page_h, cy + ch + PADDING)
216
  crop = img_bgr[int(y1):int(y2), int(x1):int(x2)]
217
 
 
218
  _, buffer = cv2.imencode('.png', crop)
219
  img_bytes = buffer.tobytes()
220
 
 
465
  st.title("PDF Material Data & Plot Extractor")
466
 
467
  uploaded_file = st.file_uploader("Upload PDF (Material Datasheet or Research Paper)", type=["pdf"])
468
+ if uploaded_file is not None:
469
+ if uploaded_file.size > 10 * 1024 * 1024:
470
+ st.error("PDF too large (max 10MB for demo)")
471
+ st.stop()
472
  if not uploaded_file:
473
 
474
  st.info("Upload a PDF to extract material data and plots")
 
502
 
503
  tab1, tab2 = st.tabs([" Material Data", " Extracted Plots"])
504
 
505
+ #with tempfile.TemporaryDirectory() as tmpdir:
506
+ # pdf_path = os.path.join(tmpdir, uploaded_file.name)
507
+ # with open(pdf_path, "wb") as f:
508
+ # f.write(uploaded_file.getbuffer())
509
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
510
+ tmp.write(uploaded_file.read())
511
+ pdf_path = tmp.name
512
  with tab1:
513
  st.subheader("Material Properties Data")
514
 
 
515
  if not st.session_state.pdf_data_extracted:
516
+ with st.spinner("Extracting material data from PDF…"):
517
  with open(pdf_path, "rb") as f:
518
  pdf_bytes = f.read()
519
 
520
  data = call_gemini_from_bytes(pdf_bytes, uploaded_file.name)
521
 
522
+
523
  if data:
524
  df = convert_to_dataframe(data)
525
  if not df.empty:
 
530
  st.warning("No data extracted")
531
  else:
532
  st.error("Failed to extract data from PDF")
 
533
  df = st.session_state.pdf_extracted_df
534
 
535
  if not df.empty:
 
657
 
658
  img_data = st.session_state.image_results[idx]['image_data'][p_idx]
659
  with cols[p_idx]:
660
+ st.image(img_data['array'], use_container_width=True, channels="BGR")
661
  if st.button(" Remove", key=f"del_s_{idx}_{p_idx}_{r['page']}"):
662
  del st.session_state.image_results[idx]['image_data'][p_idx]
663
  if len(st.session_state.image_results[idx]['image_data']) == 0: