Spaces:

aim4composites
/

MaterialsDatabase

Running

App Files Files Community

AbhijitClemson committed 2 days ago

Commit

6586bef

verified ·

1 Parent(s): 3c0a6cd

Update page_files/Upload_Data.py

Browse files

Files changed (1) hide show

page_files/Upload_Data.py +9 -21

page_files/Upload_Data.py CHANGED Viewed

@@ -27,9 +27,7 @@ from PIL import Image
 from dotenv import load_dotenv
 load_dotenv()
-_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-if not _GEMINI_API_KEY:
-    raise RuntimeError("GEMINI_API_KEY not set in environment")
 # ── imports from doctodb_rag (data extraction) ────────────────────────────────
 from categorized.Backend.PDF_DataExtraction import run_pipeline
@@ -163,18 +161,17 @@ def save_single_image_with_property(
 # expected by the rest of the UI (list of {caption, page, image_data}).
 # ─────────────────────────────────────────────────────────────────────────────
-_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyBzyMFKEqcjsWpR-OGAY42T250o1O39v3Y")
 def extract_images(pdf_path: str) -> list:
-    """
-    Use figure_extractor to detect and crop plot images from a PDF path.
-    Returns a list compatible with the image_results shape used throughout the UI:
-      [{ "caption": str, "page": int, "image_data": [{"array": bgr_ndarray, "filename": str}] }]
-    """
     try:
-        # gemini_model = init_gemini(_GEMINI_API_KEY)
-        plot_data    = get_plot_data_from_llm( GEMINI_MODEL, pdf_path)
-        raw_plots    = extract_plots(
             pdf_path=pdf_path,
             plot_data=plot_data,
             pad=22,
@@ -184,32 +181,23 @@ def extract_images(pdf_path: str) -> list:
         log.error(f"extract_images failed: {e}")
         return []
-    # raw_plots items: {caption, page, path, plot_score, plot_type}
-    # Convert to image_results shape
     image_results = []
     for item in raw_plots:
         bgr = cv2.imread(item["path"]) if item.get("path") else None
-        # clean up temp file written by extract_plots
         if item.get("path") and os.path.exists(item["path"]):
             try:
                 os.remove(item["path"])
             except Exception:
                 pass
         page    = item.get("page", 1)
         caption = item.get("caption", f"Figure (page {page})")
         safe    = re.sub(r"[^\w\-]", "_", caption)[:40]
         filename = f"page{page}_{safe}.png"
         image_results.append({
             "caption":    caption,
             "page":       page,
             "image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
         })
     return image_results

 from dotenv import load_dotenv
 load_dotenv()
+_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 # ── imports from doctodb_rag (data extraction) ────────────────────────────────
 from categorized.Backend.PDF_DataExtraction import run_pipeline
 # expected by the rest of the UI (list of {caption, page, image_data}).
 # ─────────────────────────────────────────────────────────────────────────────
 def extract_images(pdf_path: str) -> list:
     try:
+        from categorized.Backend.Pdf_ImageExtraction import get_available_model
+        import google.generativeai as genai
+        api_key = os.getenv("GEMINI_API_KEY", "")
+        genai.configure(api_key=api_key)
+        model_name = get_available_model(api_key)
+        active_model = genai.GenerativeModel(model_name)
+        plot_data = get_plot_data_from_llm(active_model, pdf_path)
+        raw_plots = extract_plots(
             pdf_path=pdf_path,
             plot_data=plot_data,
             pad=22,
         log.error(f"extract_images failed: {e}")
         return []
     image_results = []
     for item in raw_plots:
         bgr = cv2.imread(item["path"]) if item.get("path") else None
         if item.get("path") and os.path.exists(item["path"]):
             try:
                 os.remove(item["path"])
             except Exception:
                 pass
         page    = item.get("page", 1)
         caption = item.get("caption", f"Figure (page {page})")
         safe    = re.sub(r"[^\w\-]", "_", caption)[:40]
         filename = f"page{page}_{safe}.png"
         image_results.append({
             "caption":    caption,
             "page":       page,
             "image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
         })
     return image_results