Update page_files/Upload_Data.py
Browse files- page_files/Upload_Data.py +9 -21
page_files/Upload_Data.py
CHANGED
|
@@ -27,9 +27,7 @@ from PIL import Image
|
|
| 27 |
from dotenv import load_dotenv
|
| 28 |
load_dotenv()
|
| 29 |
|
| 30 |
-
_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 31 |
-
if not _GEMINI_API_KEY:
|
| 32 |
-
raise RuntimeError("GEMINI_API_KEY not set in environment")
|
| 33 |
|
| 34 |
# ── imports from doctodb_rag (data extraction) ────────────────────────────────
|
| 35 |
from categorized.Backend.PDF_DataExtraction import run_pipeline
|
|
@@ -163,18 +161,17 @@ def save_single_image_with_property(
|
|
| 163 |
# expected by the rest of the UI (list of {caption, page, image_data}).
|
| 164 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 165 |
|
| 166 |
-
_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "AIzaSyBzyMFKEqcjsWpR-OGAY42T250o1O39v3Y")
|
| 167 |
|
| 168 |
def extract_images(pdf_path: str) -> list:
|
| 169 |
-
"""
|
| 170 |
-
Use figure_extractor to detect and crop plot images from a PDF path.
|
| 171 |
-
Returns a list compatible with the image_results shape used throughout the UI:
|
| 172 |
-
[{ "caption": str, "page": int, "image_data": [{"array": bgr_ndarray, "filename": str}] }]
|
| 173 |
-
"""
|
| 174 |
try:
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
pdf_path=pdf_path,
|
| 179 |
plot_data=plot_data,
|
| 180 |
pad=22,
|
|
@@ -184,32 +181,23 @@ def extract_images(pdf_path: str) -> list:
|
|
| 184 |
log.error(f"extract_images failed: {e}")
|
| 185 |
return []
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
# raw_plots items: {caption, page, path, plot_score, plot_type}
|
| 191 |
-
# Convert to image_results shape
|
| 192 |
image_results = []
|
| 193 |
for item in raw_plots:
|
| 194 |
bgr = cv2.imread(item["path"]) if item.get("path") else None
|
| 195 |
-
# clean up temp file written by extract_plots
|
| 196 |
if item.get("path") and os.path.exists(item["path"]):
|
| 197 |
try:
|
| 198 |
os.remove(item["path"])
|
| 199 |
except Exception:
|
| 200 |
pass
|
| 201 |
-
|
| 202 |
page = item.get("page", 1)
|
| 203 |
caption = item.get("caption", f"Figure (page {page})")
|
| 204 |
safe = re.sub(r"[^\w\-]", "_", caption)[:40]
|
| 205 |
filename = f"page{page}_{safe}.png"
|
| 206 |
-
|
| 207 |
image_results.append({
|
| 208 |
"caption": caption,
|
| 209 |
"page": page,
|
| 210 |
"image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
|
| 211 |
})
|
| 212 |
-
|
| 213 |
return image_results
|
| 214 |
|
| 215 |
|
|
|
|
| 27 |
from dotenv import load_dotenv
|
| 28 |
load_dotenv()
|
| 29 |
|
| 30 |
+
_GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# ── imports from doctodb_rag (data extraction) ────────────────────────────────
|
| 33 |
from categorized.Backend.PDF_DataExtraction import run_pipeline
|
|
|
|
| 161 |
# expected by the rest of the UI (list of {caption, page, image_data}).
|
| 162 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 163 |
|
|
|
|
| 164 |
|
| 165 |
def extract_images(pdf_path: str) -> list:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
try:
|
| 167 |
+
from categorized.Backend.Pdf_ImageExtraction import get_available_model
|
| 168 |
+
import google.generativeai as genai
|
| 169 |
+
api_key = os.getenv("GEMINI_API_KEY", "")
|
| 170 |
+
genai.configure(api_key=api_key)
|
| 171 |
+
model_name = get_available_model(api_key)
|
| 172 |
+
active_model = genai.GenerativeModel(model_name)
|
| 173 |
+
plot_data = get_plot_data_from_llm(active_model, pdf_path)
|
| 174 |
+
raw_plots = extract_plots(
|
| 175 |
pdf_path=pdf_path,
|
| 176 |
plot_data=plot_data,
|
| 177 |
pad=22,
|
|
|
|
| 181 |
log.error(f"extract_images failed: {e}")
|
| 182 |
return []
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
image_results = []
|
| 185 |
for item in raw_plots:
|
| 186 |
bgr = cv2.imread(item["path"]) if item.get("path") else None
|
|
|
|
| 187 |
if item.get("path") and os.path.exists(item["path"]):
|
| 188 |
try:
|
| 189 |
os.remove(item["path"])
|
| 190 |
except Exception:
|
| 191 |
pass
|
|
|
|
| 192 |
page = item.get("page", 1)
|
| 193 |
caption = item.get("caption", f"Figure (page {page})")
|
| 194 |
safe = re.sub(r"[^\w\-]", "_", caption)[:40]
|
| 195 |
filename = f"page{page}_{safe}.png"
|
|
|
|
| 196 |
image_results.append({
|
| 197 |
"caption": caption,
|
| 198 |
"page": page,
|
| 199 |
"image_data": [{"array": bgr, "filename": filename}] if bgr is not None else [],
|
| 200 |
})
|
|
|
|
| 201 |
return image_results
|
| 202 |
|
| 203 |
|