MinerU

Paused

App Files Files Community

SkyNait committed on Feb 28, 2025

Commit

8f78162

1 Parent(s): 99cd3b7

s3 bucket

Browse files

Files changed (6) hide show

__pycache__/inference_svm_model.cpython-310.pyc +0 -0
__pycache__/mineru_single.cpython-310.pyc +0 -0
__pycache__/table_row_extraction.cpython-310.pyc +0 -0
__pycache__/worker.cpython-310.pyc +0 -0
topic_extraction.py +342 -194
worker.py +18 -1

__pycache__/inference_svm_model.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ

__pycache__/mineru_single.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ

__pycache__/table_row_extraction.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ

__pycache__/worker.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ

topic_extraction.py CHANGED Viewed

@@ -5,9 +5,12 @@ import gc
 import json
 import logging
 import fitz
 import base64
 import time
 import asyncio
 from io import BytesIO
 from typing import List, Dict, Any
@@ -32,10 +35,57 @@ logger.addHandler(file_handler)
 _GEMINI_CLIENT = None
 def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
-    """
-    Downscale the image to reduce payload size.
-    """
     arr = np.frombuffer(image_data, np.uint8)
     img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
     if img is not None:
@@ -52,9 +102,6 @@ def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -
     return image_data
 def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
-    """
-    Synchronously call the Gemini API to classify a table image.
-    """
     for attempt in range(max_retries + 1):
         try:
             prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
@@ -72,10 +119,7 @@ The two-column 'table' image include such key features:
 If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
 If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
 If the image does not show a table at all, respond with 'NO_TABLE'.
-Return only one of these exact labels as your entire response:
-TWO_COLUMN
-THREE_COLUMN
-NO_TABLE
 """
             global _GEMINI_CLIENT
             client = _GEMINI_CLIENT
@@ -94,7 +138,7 @@ NO_TABLE
                         ]
                     }
                 ],
-                config=types.GenerateContentConfig(temperature=0.0)
             )
             if resp and resp.text:
                 classification = resp.text.strip().upper()
@@ -104,54 +148,246 @@ NO_TABLE
                     return "TWO_COLUMN"
             return "NO_TABLE"
         except Exception as e:
-            error_msg = str(e)
-            logger.error(f"Gemini table classification error: {error_msg}")
-            if "503" in error_msg:
                 return "NO_TABLE"
             if attempt < max_retries:
-                logger.warning("Retrying classification due to error... attempt %d", attempt + 1)
                 time.sleep(0.5)
             else:
                 return "NO_TABLE"
 async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
-    """
-    Asynchronous wrapper for image classification.
-    """
     loop = asyncio.get_event_loop()
     preprocessed = preprocess_image(image_data)
     return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
-def unify_whitespace(text: str) -> str:
-    return re.sub(r"\s+", " ", text).strip().lower()
-def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    st_norm = unify_whitespace(search_text)
-    found = []
-    for i in range(doc.page_count):
-        raw = doc[i].get_text("raw")
-        norm = unify_whitespace(raw)
-        if st_norm in norm:
-            found.append(i)
-    doc.close()
-    return sorted(found)
-def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
-    if not page_indices:
-        raise ValueError("No page indices provided for subset creation.")
-    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
-    new_doc = fitz.open()
-    for p in sorted(set(page_indices)):
-        if 0 <= p < doc.page_count:
-            new_doc.insert_pdf(doc, from_page=p, to_page=p)
-        else:
-            logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
-            raise ValueError(f"Page index {p} out of range.")
-    subset_bytes = new_doc.tobytes()
-    new_doc.close()
-    doc.close()
-    return subset_bytes
 class GeminiTopicExtractor:
     def __init__(self, api_key: str = None, num_pages: int = 10):
@@ -163,7 +399,7 @@ class GeminiTopicExtractor:
         if not first_pages_text.strip():
             logger.error("No text from first pages => cannot extract subtopics.")
             return {}
         prompt = f"""
 You have the first pages of a PDF specification, including a table of contents.
@@ -306,8 +542,6 @@ Now, extract topics from this text:
 {first_pages_text}
 """
         global _GEMINI_CLIENT
-        if _GEMINI_CLIENT is None:
-            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
         client = _GEMINI_CLIENT
         try:
             response = client.models.generate_content(
@@ -318,7 +552,6 @@ Now, extract topics from this text:
             if not response or not response.text:
                 logger.warning("No text from LLM => returning empty subtopics.")
                 return {}
             raw_json = response.text.strip()
             cleaned = raw_json.replace("```json", "").replace("```", "")
             try:
@@ -348,7 +581,16 @@ Now, extract topics from this text:
     def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
         text_parts = []
         try:
-            doc = fitz.open(pdf_path)
             pages_to_read = min(num_pages, doc.page_count)
             for i in range(pages_to_read):
                 raw_text = doc[i].get_text("raw")
@@ -358,123 +600,6 @@ Now, extract topics from this text:
             logger.error(f"Could not open PDF: {e}")
         return "\n".join(text_parts)
-class LocalImageWriter(DataWriter):
-    def __init__(self, output_folder: str, gemini_api_key: str):
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        self.descriptions = {}
-        self._img_count = 0
-        self.gemini_api_key = gemini_api_key
-    def write(self, path: str, data: bytes) -> None:
-        self._img_count += 1
-        unique_id = f"img_{self._img_count}.jpg"
-        self.descriptions[path] = {
-            "data": data,
-            "relative_path": unique_id,
-            "table_classification": "NO_TABLE",
-            "final_alt": ""
-        }
-    async def post_process_async(self, key: str, md_content: str) -> str:
-        logger.info("Classifying images to detect tables (async).")
-        tasks = []
-        for p, info in self.descriptions.items():
-            tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
-        for p, task in tasks:
-            try:
-                classification = await task
-                self.descriptions[p]['table_classification'] = classification
-            except Exception as e:
-                logger.error(f"Table classification error: {e}")
-                self.descriptions[p]['table_classification'] = "NO_TABLE"
-        for p, info in self.descriptions.items():
-            cls = info['table_classification']
-            if cls == "TWO_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
-            elif cls == "THREE_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
-            else:
-                info['final_alt'] = "NO_TABLE image"
-        for p, info in self.descriptions.items():
-            old_md = f"![]({key}{p})"
-            new_md = f"![{info['final_alt']}]({info['relative_path']})"
-            md_content = md_content.replace(old_md, new_md)
-        md_content = self._process_table_images_in_markdown(md_content)
-        final_lines = []
-        for line in md_content.split("\n"):
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
-                final_lines.append(line.strip())
-        return "\n".join(final_lines)
-    def post_process(self, key: str, md_content: str) -> str:
-        """
-        Synchronous wrapper around the asynchronous post_process_async.
-        """
-        return asyncio.run(self.post_process_async(key, md_content))
-    def _process_table_images_in_markdown(self, md_content: str) -> str:
-        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
-        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
-        if not matches:
-            return md_content
-        for (col_type, image_id) in matches:
-            logger.info(f"Processing table image => {image_id}, columns={col_type}")
-            temp_path = os.path.join(self.output_folder, image_id)
-            desc_item = None
-            for k, val in self.descriptions.items():
-                if val["relative_path"] == image_id:
-                    desc_item = val
-                    break
-            if not desc_item:
-                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
-                continue
-            if not os.path.exists(temp_path):
-                with open(temp_path, "wb") as f:
-                    f.write(desc_item["data"])
-            try:
-                if col_type.lower() == 'two':
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=True,
-                        enable_subtopic_merge=True,
-                        subtopic_threshold=0.2
-                    )
-                else:
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=False,
-                        enable_subtopic_merge=False,
-                        subtopic_threshold=0.2
-                    )
-                row_boxes = extractor.process_image(temp_path)
-                out_folder = temp_path + "_rows"
-                os.makedirs(out_folder, exist_ok=True)
-                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
-                snippet = ["**Extracted table cells:**"]
-                for i, row in enumerate(row_boxes):
-                    row_dir = os.path.join(out_folder, f"row_{i}")
-                    for j, _ in enumerate(row):
-                        cell_file = f"col_{j}.jpg"
-                        cell_path = os.path.join(row_dir, cell_file)
-                        relp = os.path.relpath(cell_path, self.output_folder)
-                        snippet.append(f"![Row {i} Col {j}]({relp})")
-                new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
-                md_content = md_content.replace(old_line, new_snip)
-            except Exception as e:
-                logger.error(f"Error processing table image {image_id}: {e}")
-            finally:
-                if os.path.exists(temp_path):
-                    os.remove(temp_path)
-        return md_content
 class MineruNoTextProcessor:
     def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
@@ -486,6 +611,20 @@ class MineruNoTextProcessor:
         self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
     def cleanup_gpu(self):
         try:
             gc.collect()
@@ -498,18 +637,27 @@ class MineruNoTextProcessor:
         logger.info(f"Processing PDF: {pdf_path}")
         try:
             subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
             logger.info(f"Gemini returned subtopics: {subtopics}")
-            with open(pdf_path, "rb") as f:
-                pdf_bytes = f.read()
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             total_pages = doc.page_count
             doc.close()
             final_pages = set()
             if not subtopics:
-                logger.warning("No subtopics found. We'll process the entire PDF as fallback.")
                 final_pages = set(range(total_pages))
             else:
                 for subname, rng in subtopics.items():
@@ -529,16 +677,10 @@ class MineruNoTextProcessor:
                             chosen_page = p
                             break
                     if chosen_page is None:
-                        if occs:
-                            chosen_page = occs[-1]
-                            logger.warning(f"No occurrence >= {doc_start_0} for '{subname}'. Using last => {chosen_page}")
-                        else:
-                            chosen_page = 0
-                            logger.warning(f"No occurrences for '{subname}'. Using page 0.")
                     raw_offset = chosen_page - doc_start_0
                     offset = max(0, raw_offset)
-                    logger.info(f"Subtopic '{subname}': doc_start={start_p}, chosen_page={chosen_page}, raw_offset={raw_offset}, offset={offset}")
                     s0 = (start_p - 1) + offset
                     e0 = (end_p - 1) + offset
                     s0 = max(0, min(total_pages - 1, s0))
@@ -546,12 +688,11 @@ class MineruNoTextProcessor:
                     for pp in range(s0, e0 + 1):
                         final_pages.add(pp)
             if not final_pages:
-                logger.warning("No valid pages after offset. We'll process entire PDF.")
                 final_pages = set(range(total_pages))
             logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
             subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
@@ -562,18 +703,24 @@ class MineruNoTextProcessor:
                 table_enable=self.table_enable
             )
             logger.info("doc_analyze complete. Extracting images.")
-            writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
             pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
-            md_content = pipe_result.get_markdown("local-unique-prefix/")
-            final_markdown = writer.post_process("local-unique-prefix/", md_content)
-            out_path = os.path.join(self.output_folder, "final_output.md")
-            with open(out_path, "w", encoding="utf-8") as f:
-                f.write(final_markdown)
             return final_markdown
         finally:
             self.cleanup_gpu()
@@ -584,5 +731,6 @@ if __name__ == "__main__":
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
         md_output = processor.process(input_pdf)
     except Exception as e:
         logger.error(f"Processing failed: {e}")

 import json
 import logging
 import fitz
+import boto3
 import base64
 import time
 import asyncio
+import tempfile
+import requests
 from io import BytesIO
 from typing import List, Dict, Any
 _GEMINI_CLIENT = None
def unify_whitespace(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends.

    Letter case is left untouched.
    """
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """Return the 0-based indices of the pages whose text contains *search_text*.

    The search text and each page's extracted text are both normalized with
    ``unify_whitespace`` before the substring test, so differences in line
    breaks and spacing do not prevent a match.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        search_text: Text to look for on each page.

    Returns:
        Matching page indices in ascending order; empty when nothing matches.
    """
    needle = unify_whitespace(search_text)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Pages are visited in ascending order, so no extra sort is needed.
        return [
            page_no
            for page_no in range(doc.page_count)
            if needle in unify_whitespace(doc[page_no].get_text("raw"))
        ]
    finally:
        # Release the document even when text extraction raises.
        doc.close()
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """Build a new PDF that contains only the requested pages of the original.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page indices to keep. Duplicates are ignored and
            the pages are emitted in ascending index order.

    Returns:
        The serialized bytes of the subset PDF.

    Raises:
        ValueError: If ``page_indices`` is empty, or if any index falls outside
            ``0..page_count - 1``.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Closed on both the success path and the out-of-range error path.
            new_doc.close()
    finally:
        doc.close()
class s3Writer:
    """Small uploader that sends byte payloads to a single S3 bucket via boto3."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Remember the target bucket and create the boto3 S3 client.

        Args:
            ak: Access key id passed to boto3 as ``aws_access_key_id``.
            sk: Secret key passed to boto3 as ``aws_secret_access_key``.
            bucket: Name of the bucket every upload goes to.
            endpoint_url: Endpoint URL handed to the boto3 client.
        """
        self.bucket = bucket
        client_options = {
            "aws_access_key_id": ak,
            "aws_secret_access_key": sk,
            "endpoint_url": endpoint_url,
        }
        self.client = boto3.client('s3', **client_options)

    def write(self, path: str, data: bytes) -> None:
        """Upload *data* to the bucket under the object key *path*."""
        payload = BytesIO(data)
        self.client.upload_fileobj(payload, self.bucket, path)
        logger.info(f"Uploaded to S3: {path}")
+# Downscale the image to reduce payload size and save time on the Gemini call.
 def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
     arr = np.frombuffer(image_data, np.uint8)
     img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
     if img is not None:
     return image_data
 def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
     for attempt in range(max_retries + 1):
         try:
             prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
 If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
 If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
 If the image does not show a table at all, respond with 'NO_TABLE'.
+Return only one of these exact labels.
 """
             global _GEMINI_CLIENT
             client = _GEMINI_CLIENT
                         ]
                     }
                 ],
+                config=types.GenerateContentConfig(temperature=0.)
             )
             if resp and resp.text:
                 classification = resp.text.strip().upper()
                     return "TWO_COLUMN"
             return "NO_TABLE"
         except Exception as e:
+            logger.error(f"Gemini table classification error: {e}")
+            if "503" in str(e):
                 return "NO_TABLE"
             if attempt < max_retries:
                 time.sleep(0.5)
             else:
                 return "NO_TABLE"
 async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
     """Classify an image with Gemini without blocking the event loop on the API call.

     The image is first passed through ``preprocess_image`` (run inline, on the
     calling thread), then ``call_gemini_for_table_classification`` is executed
     in the loop's default executor.

     Args:
         image_data: Encoded image bytes.
         api_key: Gemini API key forwarded to the classifier.
         max_retries: Retry budget forwarded to the classifier.

     Returns:
         Whatever label ``call_gemini_for_table_classification`` returns.
     """
     # Inside a coroutine a loop is always running; get_running_loop() is the
     # supported way to obtain it (get_event_loop() is discouraged here).
     loop = asyncio.get_running_loop()
     preprocessed = preprocess_image(image_data)
     return await loop.run_in_executor(
         None, call_gemini_for_table_classification, preprocessed, api_key, max_retries
     )
class S3ImageWriter(DataWriter):
    """DataWriter that uploads extracted images to S3 and rewrites markdown links.

    Every image handed to ``write`` is uploaded under ``base_path`` and also kept
    in memory, so that ``post_process`` can classify it and, for images labelled
    as two- or three-column tables, replace the image link with links to the
    individual cell images.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        """Store the uploader, key prefix and API key.

        Args:
            s3_writer: Uploader used for every image and cell image.
            base_path: Key prefix for uploads; a trailing "/" is appended if missing.
            gemini_api_key: API key forwarded to ``classify_image_async``.
        """
        self.s3_writer = s3_writer
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        # Maps the path given to write() -> record with the image bytes, its S3
        # key, the classification label and the alt text derived from it.
        self.descriptions = {}
        self._img_count = 0

    def write(self, path: str, data: bytes) -> None:
        """Upload one image under a sequential ``img_<n>.jpg`` key and record it."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all recorded images and rewrite *md_content* accordingly.

        Args:
            key: Prefix that precedes each image path in the incoming markdown
                links (``![](<key><path>)``).
            md_content: Markdown produced by the pipeline.

        Returns:
            Only the image lines (``![...](...)``) of the rewritten markdown,
            stripped and joined with newlines.
        """
        logger.info("Classifying images to detect tables.")
        paths = list(self.descriptions)
        # Run the classifications concurrently; awaiting bare coroutines one by
        # one would execute them strictly in sequence.
        outcomes = await asyncio.gather(
            *(
                classify_image_async(self.descriptions[p]["data"], self.gemini_api_key)
                for p in paths
            ),
            return_exceptions=True,
        )
        for p, outcome in zip(paths, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Table classification error: {outcome}")
                outcome = "NO_TABLE"
            elif isinstance(outcome, BaseException):
                # Cancellation and similar must not be turned into a label.
                raise outcome
            self.descriptions[p]['table_classification'] = outcome

        alt_by_label = {
            "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
            "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
        }
        for p, info in self.descriptions.items():
            info['final_alt'] = alt_by_label.get(info['table_classification'], "NO_TABLE image")
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep nothing but image lines in the final output.
        final_lines = []
        for line in md_content.split("\n"):
            stripped = line.strip()
            if re.match(r"^\!\[.*\]\(.*\)", stripped):
                final_lines.append(stripped)
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Replace each table-image link with links to its uploaded cell images.

        Args:
            key: Unused here; kept for interface compatibility.
            md_content: Markdown whose table images carry the
                "HAS TO BE PROCESSED" alt text.

        Returns:
            The markdown with every successfully processed table link replaced.
            Images that fail to process are logged and left as they were.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = next(
                (d.get("data") for d in self.descriptions.values() if d.get("s3_path") == s3_key),
                None,
            )
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # The extractor works on a file path, so spill the bytes to disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name
            try:
                # Two-column tables merge rows/subtopics; three-column ones do not.
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)
                stem = os.path.basename(s3_key).split('.')[0]
                snippet = ["**Extracted table cells:**"]
                with tempfile.TemporaryDirectory() as cells_dir:
                    # Crop the cells to disk, then upload each crop. The
                    # row_<i>/col_<j>.jpg layout mirrors how LocalImageWriter
                    # reads the cells back.
                    # NOTE(review): confirm against TableExtractor.save_extracted_cells.
                    extractor.save_extracted_cells(temp_path, row_boxes, cells_dir)
                    for i, row in enumerate(row_boxes):
                        for j, _ in enumerate(row):
                            cell_unique_key = f"{self.base_path}cells/{stem}_row{i}_col{j}.jpg"
                            cell_file = os.path.join(cells_dir, f"row_{i}", f"col_{j}.jpg")
                            if os.path.exists(cell_file):
                                with open(cell_file, "rb") as fh:
                                    cell_bytes = fh.read()
                            else:
                                # Keep the link valid rather than dropping the cell.
                                logger.warning(
                                    f"Cell image missing for {cell_unique_key}; uploading full table image."
                                )
                                cell_bytes = img_data
                            self.s3_writer.write(cell_unique_key, cell_bytes)
                            snippet.append(f"![Row {i} Col {j}]({cell_unique_key})")
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
                # Best effort: one bad table must not abort the whole document.
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                try:
                    os.remove(temp_path)
                except OSError as e:
                    logger.debug(f"Could not remove temp file {temp_path}: {e}")
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` in a new event loop."""
        return asyncio.run(self.post_process_async(key, md_content))
+#test
class LocalImageWriter(DataWriter):
    """DataWriter that keeps images in memory and writes table cells to a local folder.

    Images passed to ``write`` are only recorded; ``post_process`` classifies
    them and, for images labelled as two- or three-column tables, extracts the
    cells under ``output_folder`` and links to them from the markdown.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        """Create the output folder if needed and initialize the image registry.

        Args:
            output_folder: Directory that receives temporary images and cell folders.
            gemini_api_key: API key forwarded to ``classify_image_async``.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        # Maps the path given to write() -> record with the image bytes, its
        # relative file name, the classification label and the alt text.
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

    def write(self, path: str, data: bytes) -> None:
        """Record one image under a sequential ``img_<n>.jpg`` name (nothing is written to disk)."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        self.descriptions[path] = {
            "data": data,
            "relative_path": unique_id,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all recorded images and rewrite *md_content* accordingly.

        Args:
            key: Prefix that precedes each image path in the incoming markdown
                links (``![](<key><path>)``).
            md_content: Markdown produced by the pipeline.

        Returns:
            Only the image lines (``![...](...)``) of the rewritten markdown,
            stripped and joined with newlines.
        """
        logger.info("Classifying images to detect tables.")
        paths = list(self.descriptions)
        # Run the classifications concurrently; awaiting bare coroutines one by
        # one would execute them strictly in sequence.
        outcomes = await asyncio.gather(
            *(
                classify_image_async(self.descriptions[p]["data"], self.gemini_api_key)
                for p in paths
            ),
            return_exceptions=True,
        )
        for p, outcome in zip(paths, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Table classification error: {outcome}")
                outcome = "NO_TABLE"
            elif isinstance(outcome, BaseException):
                # Cancellation and similar must not be turned into a label.
                raise outcome
            self.descriptions[p]['table_classification'] = outcome

        alt_by_label = {
            "TWO_COLUMN": "HAS TO BE PROCESSED - two column table",
            "THREE_COLUMN": "HAS TO BE PROCESSED - three column table",
        }
        for p, info in self.descriptions.items():
            info['final_alt'] = alt_by_label.get(info['table_classification'], "NO_TABLE image")
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")

        md_content = self._process_table_images_in_markdown(md_content)

        # Keep nothing but image lines in the final output.
        final_lines = []
        for line in md_content.split("\n"):
            stripped = line.strip()
            if re.match(r"^\!\[.*\]\(.*\)", stripped):
                final_lines.append(stripped)
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        """Replace each table-image link with links to its extracted cell images.

        For every matching link the image is written to ``output_folder``, the
        cells are saved under ``<image>_rows/row_<i>/col_<j>.jpg``, and the
        temporary image file is removed again. Images that fail to process are
        logged and their links are left unchanged.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, image_id) in matches:
            logger.info(f"Processing table image => {image_id}, columns={col_type}")
            temp_path = os.path.join(self.output_folder, image_id)
            desc_item = next(
                (val for val in self.descriptions.values() if val["relative_path"] == image_id),
                None,
            )
            if not desc_item:
                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
                continue
            # The extractor works on a file path, so spill the bytes to disk.
            if not os.path.exists(temp_path):
                with open(temp_path, "wb") as f:
                    f.write(desc_item["data"])
            try:
                # Two-column tables merge rows/subtopics; three-column ones do not.
                is_two_col = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_col,
                    enable_subtopic_merge=is_two_col,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)
                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.jpg")
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"![Row {i} Col {j}]({relp})")
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
                # Best effort: one bad table must not abort the whole document.
                logger.error(f"Error processing table image {image_id}: {e}")
            finally:
                # Only the cell folder is kept; the full-image file is temporary.
                if os.path.exists(temp_path):
                    os.remove(temp_path)
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` in a new event loop."""
        return asyncio.run(self.post_process_async(key, md_content))
 class GeminiTopicExtractor:
     def __init__(self, api_key: str = None, num_pages: int = 10):
         if not first_pages_text.strip():
             logger.error("No text from first pages => cannot extract subtopics.")
             return {}
         prompt = f"""
 You have the first pages of a PDF specification, including a table of contents.
 {first_pages_text}
 """
         global _GEMINI_CLIENT
         client = _GEMINI_CLIENT
         try:
             response = client.models.generate_content(
             if not response or not response.text:
                 logger.warning("No text from LLM => returning empty subtopics.")
                 return {}
             raw_json = response.text.strip()
             cleaned = raw_json.replace("```json", "").replace("```", "")
             try:
     def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
         text_parts = []
         try:
+            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
+                response = requests.get(pdf_path)
+                if response.status_code != 200:
+                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
+                    return ""
+                pdf_bytes = response.content
+            else:
+                with open(pdf_path, "rb") as f:
+                    pdf_bytes = f.read()
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             pages_to_read = min(num_pages, doc.page_count)
             for i in range(pages_to_read):
                 raw_text = doc[i].get_text("raw")
             logger.error(f"Could not open PDF: {e}")
         return "\n".join(text_parts)
 class MineruNoTextProcessor:
     def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
         self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
+        if (os.getenv("S3_ACCESS_KEY") and os.getenv("S3_SECRET_KEY") and
+            os.getenv("S3_BUCKET_NAME") and os.getenv("S3_ENDPOINT")):
+            self.use_s3 = True
+            self.s3_writer = s3Writer(
+                ak=os.getenv("S3_ACCESS_KEY"),
+                sk=os.getenv("S3_SECRET_KEY"),
+                bucket=os.getenv("S3_BUCKET_NAME"),
+                endpoint_url=os.getenv("S3_ENDPOINT")
+            )
+            self.base_path = "topic_extraction/"
+        else:
+            self.use_s3 = False
     def cleanup_gpu(self):
         try:
             gc.collect()
         logger.info(f"Processing PDF: {pdf_path}")
         try:
             subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
             logger.info(f"Gemini returned subtopics: {subtopics}")
+            # Load the PDF bytes: http(s) paths are downloaded, anything else is read
+            # from the local filesystem.
+            if pdf_path.startswith(("http://", "https://")):
+                # Without a timeout requests waits indefinitely on a stalled server;
+                # (connect, read) limits in seconds bound the wait.
+                response = requests.get(pdf_path, timeout=(10, 60))
+                if response.status_code != 200:
+                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
+                    raise Exception(f"Failed to download PDF: {pdf_path}")
+                pdf_bytes = response.content
+                logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
+            else:
+                with open(pdf_path, "rb") as f:
+                    pdf_bytes = f.read()
+                logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             total_pages = doc.page_count
             doc.close()
             final_pages = set()
             if not subtopics:
+                logger.warning("No subtopics found. Processing entire PDF as fallback.")
                 final_pages = set(range(total_pages))
             else:
                 for subname, rng in subtopics.items():
                             chosen_page = p
                             break
                     if chosen_page is None:
+                        chosen_page = occs[-1] if occs else 0
+                        logger.warning(f"No suitable occurrence for '{subname}'. Using page {chosen_page}.")
                     raw_offset = chosen_page - doc_start_0
                     offset = max(0, raw_offset)
                     s0 = (start_p - 1) + offset
                     e0 = (end_p - 1) + offset
                     s0 = max(0, min(total_pages - 1, s0))
                     for pp in range(s0, e0 + 1):
                         final_pages.add(pp)
             if not final_pages:
+                logger.warning("No valid pages after offset. Processing entire PDF.")
                 final_pages = set(range(total_pages))
             logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
             subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
                 table_enable=self.table_enable
             )
             logger.info("doc_analyze complete. Extracting images.")
+            if self.use_s3:
+                writer = S3ImageWriter(self.s3_writer, self.base_path, self.gemini_api_key)
+                md_prefix = self.base_path
+            else:
+                writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
+                md_prefix = "local-unique-prefix/"
             pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
+            md_content = pipe_result.get_markdown(md_prefix)
+            final_markdown = writer.post_process(md_prefix, md_content)
+            if self.use_s3:
+                final_md_key = f"{self.base_path}final_output.md"
+                self.s3_writer.write(final_md_key, final_markdown.encode("utf-8"))
+                logger.info(f"Final markdown uploaded to S3 at {final_md_key}")
+            else:
+                out_path = os.path.join(self.output_folder, "final_output.md")
+                with open(out_path, "w", encoding="utf-8") as f:
+                    f.write(final_markdown)
             return final_markdown
         finally:
             self.cleanup_gpu()
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
         md_output = processor.process(input_pdf)
+        logger.info("Processing completed successfully.")
     except Exception as e:
         logger.error(f"Processing failed: {e}")

worker.py CHANGED Viewed

@@ -121,7 +121,24 @@ class RabbitMQWorker:
                     logger.error("[Worker %s] Failed to publish results.", thread_id)
                 logger.info("[Worker %s] Contexts: %s", thread_id, contexts)
             else:
                 ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
                 logger.warning("[Worker %s] Unknown pattern type in headers: %s", thread_id, pattern)

                     logger.error("[Worker %s] Failed to publish results.", thread_id)
                 logger.info("[Worker %s] Contexts: %s", thread_id, contexts)
+            elif pattern == "extract_topics":
+                # Topic-extraction request: run the PDF through MineruNoTextProcessor and
+                # publish the resulting markdown back under a new pattern name.
+                data = body_dict.get("data")
+                pdf_path = data.get("pdf_path") if isinstance(data, dict) else None  # url
+                if not pdf_path:
+                    # A message without a PDF location can never succeed; requeueing it
+                    # would only redeliver it forever, so reject it without requeue.
+                    logger.error("[Worker %s] extract_topics message has no pdf_path.", thread_id)
+                    ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
+                else:
+                    try:
+                        # Constructed inside the try so a constructor failure is nacked
+                        # instead of leaving the delivery unacknowledged.
+                        # output_folder is a required argument of
+                        # MineruNoTextProcessor.__init__; omitting it raised TypeError.
+                        # NOTE(review): the env var name and default are placeholders —
+                        # confirm against the deployment configuration.
+                        topic_processor = MineruNoTextProcessor(
+                            output_folder=os.getenv("TOPIC_OUTPUT_FOLDER", "/tmp/topic_extraction"),
+                            gemini_api_key=os.getenv("GEMINI_API_KEY"),
+                        )
+                        topics_markdown = topic_processor.process(pdf_path)
+                        data["topics_markdown"] = topics_markdown
+                        body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
+                        body_dict["data"] = data
+                        if self.publish_message(body_dict, headers):
+                            ch.basic_ack(delivery_tag=method.delivery_tag)
+                        else:
+                            ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
+                    except Exception as e:
+                        logger.error(f"Error processing topic extraction: {e}")
+                        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
             else:
                 ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
                 logger.warning("[Worker %s] Unknown pattern type in headers: %s", thread_id, pattern)