AbhijitClemson committed on
Commit
6586bef
·
verified ·
1 Parent(s): 3c0a6cd

Update page_files/Upload_Data.py

Browse files
Files changed (1) hide show
  1. page_files/Upload_Data.py +9 -21
page_files/Upload_Data.py CHANGED
@@ -27,9 +27,7 @@ from PIL import Image
27
  from dotenv import load_dotenv
28
  load_dotenv()
29
 
30
- _GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
31
- if not _GEMINI_API_KEY:
32
- raise RuntimeError("GEMINI_API_KEY not set in environment")
33
 
34
  # ── imports from doctodb_rag (data extraction) ────────────────────────────────
35
  from categorized.Backend.PDF_DataExtraction import run_pipeline
@@ -163,18 +161,17 @@ def save_single_image_with_property(
163
  # expected by the rest of the UI (list of {caption, page, image_data}).
164
  # ─────────────────────────────────────────────────────────────────────────────
165
 
166
- _GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyBzyMFKEqcjsWpR-OGAY42T250o1O39v3Y")
167
 
168
  def extract_images(pdf_path: str) -> list:
169
- """
170
- Use figure_extractor to detect and crop plot images from a PDF path.
171
- Returns a list compatible with the image_results shape used throughout the UI:
172
- [{ "caption": str, "page": int, "image_data": [{"array": bgr_ndarray, "filename": str}] }]
173
- """
174
  try:
175
- # gemini_model = init_gemini(_GEMINI_API_KEY)
176
- plot_data = get_plot_data_from_llm( GEMINI_MODEL, pdf_path)
177
- raw_plots = extract_plots(
 
 
 
 
 
178
  pdf_path=pdf_path,
179
  plot_data=plot_data,
180
  pad=22,
@@ -184,32 +181,23 @@ def extract_images(pdf_path: str) -> list:
184
  log.error(f"extract_images failed: {e}")
185
  return []
186
 
187
-
188
-
189
-
190
- # raw_plots items: {caption, page, path, plot_score, plot_type}
191
- # Convert to image_results shape
192
  image_results = []
193
  for item in raw_plots:
194
  bgr = cv2.imread(item["path"]) if item.get("path") else None
195
- # clean up temp file written by extract_plots
196
  if item.get("path") and os.path.exists(item["path"]):
197
  try:
198
  os.remove(item["path"])
199
  except Exception:
200
  pass
201
-
202
  page = item.get("page", 1)
203
  caption = item.get("caption", f"Figure (page {page})")
204
  safe = re.sub(r"[^\w\-]", "_", caption)[:40]
205
  filename = f"page{page}_{safe}.png"
206
-
207
  image_results.append({
208
  "caption": caption,
209
  "page": page,
210
  "image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
211
  })
212
-
213
  return image_results
214
 
215
 
 
27
  from dotenv import load_dotenv
28
  load_dotenv()
29
 
30
+ _GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
 
 
31
 
32
  # ── imports from doctodb_rag (data extraction) ────────────────────────────────
33
  from categorized.Backend.PDF_DataExtraction import run_pipeline
 
161
  # expected by the rest of the UI (list of {caption, page, image_data}).
162
  # ─────────────────────────────────────────────────────────────────────────────
163
 
 
164
 
165
  def extract_images(pdf_path: str) -> list:
 
 
 
 
 
166
  try:
167
+ from categorized.Backend.Pdf_ImageExtraction import get_available_model
168
+ import google.generativeai as genai
169
+ api_key = os.getenv("GEMINI_API_KEY", "")
170
+ genai.configure(api_key=api_key)
171
+ model_name = get_available_model(api_key)
172
+ active_model = genai.GenerativeModel(model_name)
173
+ plot_data = get_plot_data_from_llm(active_model, pdf_path)
174
+ raw_plots = extract_plots(
175
  pdf_path=pdf_path,
176
  plot_data=plot_data,
177
  pad=22,
 
181
  log.error(f"extract_images failed: {e}")
182
  return []
183
 
 
 
 
 
 
184
  image_results = []
185
  for item in raw_plots:
186
  bgr = cv2.imread(item["path"]) if item.get("path") else None
 
187
  if item.get("path") and os.path.exists(item["path"]):
188
  try:
189
  os.remove(item["path"])
190
  except Exception:
191
  pass
 
192
  page = item.get("page", 1)
193
  caption = item.get("caption", f"Figure (page {page})")
194
  safe = re.sub(r"[^\w\-]", "_", caption)[:40]
195
  filename = f"page{page}_{safe}.png"
 
196
  image_results.append({
197
  "caption": caption,
198
  "page": page,
199
  "image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
200
  })
 
201
  return image_results
202
 
203