correct RabbitMQ
Browse files- __pycache__/table_row_extraction.cpython-310.pyc +0 -0
- __pycache__/topic_extr.cpython-310.pyc +0 -0
- __pycache__/worker.cpython-310.pyc +0 -0
- input_output/168982-specification-gcse-mathematics.pdf +3 -0
- page_range.py +0 -300
- table_row_extraction.py +0 -441
- topic_extr.py +134 -156
- topic_extraction.py +0 -988
- topic_extraction.log → topic_processor.log +0 -0
- worker.py +16 -24
__pycache__/table_row_extraction.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/table_row_extraction.cpython-310.pyc and b/__pycache__/table_row_extraction.cpython-310.pyc differ
|
|
|
__pycache__/topic_extr.cpython-310.pyc
ADDED
|
Binary file (7.56 kB). View file
|
|
|
__pycache__/worker.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
|
|
|
input_output/168982-specification-gcse-mathematics.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf3ff38c2035447e51b0d6dc8df35eeb1d5cc9296d77f310d71fe1d39c66062a
|
| 3 |
+
size 13646315
|
page_range.py
DELETED
|
@@ -1,300 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
import json
|
| 5 |
-
import logging
|
| 6 |
-
import fitz
|
| 7 |
-
import requests
|
| 8 |
-
import time
|
| 9 |
-
from statistics import mode, median
|
| 10 |
-
from typing import Dict, List, Tuple
|
| 11 |
-
|
| 12 |
-
from google import genai
|
| 13 |
-
from google.genai import types
|
| 14 |
-
|
| 15 |
-
logging.basicConfig(level=logging.INFO)
|
| 16 |
-
logger = logging.getLogger(__name__)
|
| 17 |
-
|
| 18 |
-
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> list:
|
| 19 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 20 |
-
st_norm = re.sub(r"\s+", " ", search_text).strip()
|
| 21 |
-
found = []
|
| 22 |
-
for i in range(doc.page_count):
|
| 23 |
-
raw = doc[i].get_text("raw")
|
| 24 |
-
norm = re.sub(r"\s+", " ", raw).strip()
|
| 25 |
-
if st_norm in norm:
|
| 26 |
-
found.append(i)
|
| 27 |
-
doc.close()
|
| 28 |
-
return sorted(found)
|
| 29 |
-
|
| 30 |
-
class GeminiTopicExtractor:
|
| 31 |
-
def __init__(self, api_key: str = None, num_pages: int = 20):
|
| 32 |
-
self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
|
| 33 |
-
self.num_pages = num_pages
|
| 34 |
-
|
| 35 |
-
def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
|
| 36 |
-
text_parts = []
|
| 37 |
-
try:
|
| 38 |
-
if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
|
| 39 |
-
response = requests.get(pdf_path)
|
| 40 |
-
if response.status_code != 200:
|
| 41 |
-
logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
|
| 42 |
-
return ""
|
| 43 |
-
pdf_bytes = response.content
|
| 44 |
-
else:
|
| 45 |
-
with open(pdf_path, "rb") as f:
|
| 46 |
-
pdf_bytes = f.read()
|
| 47 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 48 |
-
pages_to_read = min(num_pages, doc.page_count)
|
| 49 |
-
for i in range(pages_to_read):
|
| 50 |
-
raw_text = doc[i].get_text("raw")
|
| 51 |
-
text_parts.append(raw_text)
|
| 52 |
-
doc.close()
|
| 53 |
-
except Exception as e:
|
| 54 |
-
logger.error(f"Could not open PDF: {e}")
|
| 55 |
-
return "\n".join(text_parts)
|
| 56 |
-
|
| 57 |
-
def extract_subtopics(self, pdf_path: str) -> dict:
|
| 58 |
-
first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
|
| 59 |
-
if not first_pages_text.strip():
|
| 60 |
-
logger.error("No text from first pages => cannot extract subtopics.")
|
| 61 |
-
return {}
|
| 62 |
-
prompt = f"""
|
| 63 |
-
You have the first pages of a PDF specification, including a table of contents.
|
| 64 |
-
Instructions:
|
| 65 |
-
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
|
| 66 |
-
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
|
| 67 |
-
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
|
| 68 |
-
4. Output only valid JSON of the form:
|
| 69 |
-
{{
|
| 70 |
-
"Subtopic A": [start_page, end_page],
|
| 71 |
-
"Subtopic B": [start_page, end_page]
|
| 72 |
-
}}
|
| 73 |
-
5. If you can't find any subtopics, return an empty JSON.
|
| 74 |
-
Important notes:
|
| 75 |
-
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
|
| 76 |
-
- The final output must be valid JSON only, with no extra text or code blocks.
|
| 77 |
-
|
| 78 |
-
Examples:
|
| 79 |
-
1. Given this table of contents:
|
| 80 |
-
1 Introduction – 2
|
| 81 |
-
Why choose Edexcel A Level Mathematics? - 2
|
| 82 |
-
Supporting you in planning and implementing this qualification - 3
|
| 83 |
-
Qualification at a glance - 5
|
| 84 |
-
2 Subject content and assessment information – 7
|
| 85 |
-
Paper 1 and Paper 2: Pure Mathematics - 11
|
| 86 |
-
Paper 3: Statistics and Mechanics - 30
|
| 87 |
-
Assessment Objectives - 40
|
| 88 |
-
3 Administration and general information – 42
|
| 89 |
-
Entries - 42
|
| 90 |
-
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
|
| 91 |
-
Student recruitment and progression - 45
|
| 92 |
-
|
| 93 |
-
The correct output should be:
|
| 94 |
-
{{
|
| 95 |
-
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
|
| 96 |
-
"Paper 3: Statistics and Mechanics": [30, 38]
|
| 97 |
-
}}
|
| 98 |
-
2. Given this table of contents:
|
| 99 |
-
Qualification at a glance – 1
|
| 100 |
-
Assessment Objectives and weightings - 4
|
| 101 |
-
Knowledge, skills and understanding – 5
|
| 102 |
-
Theme 1: Introduction to markets and market failure - 5
|
| 103 |
-
Theme 2: The UK economy – performance and policies - 11
|
| 104 |
-
Theme 3: Business behaviour and the labour market - 21
|
| 105 |
-
Theme 4: A global perspective - 29
|
| 106 |
-
Assessment – 39
|
| 107 |
-
Assessment summary - 39
|
| 108 |
-
Assessment objectives - 41
|
| 109 |
-
Assessment overview - 42
|
| 110 |
-
Breakdown of assessment objectives - 42
|
| 111 |
-
|
| 112 |
-
The correct output should be:
|
| 113 |
-
{{
|
| 114 |
-
"Theme 1: Introduction to markets and market failure": [5, 10],
|
| 115 |
-
"Theme 2: The UK economy – performance and policies": [11, 20],
|
| 116 |
-
"Theme 3: Business behaviour and the labour market": [21, 28],
|
| 117 |
-
"Theme 4: A global perspective": [29, 38]
|
| 118 |
-
}}
|
| 119 |
-
|
| 120 |
-
3. You might also see sections like:
|
| 121 |
-
2.1 AS Unit 1 11
|
| 122 |
-
2.2 AS Unit 2 18
|
| 123 |
-
2.3 A2 Unit 3 24
|
| 124 |
-
2.4 A2 Unit 4 31
|
| 125 |
-
In that scenario, your output might look like:
|
| 126 |
-
{{
|
| 127 |
-
"2.1 AS Unit 1": [11, 17],
|
| 128 |
-
"2.2 AS Unit 2": [18, 23],
|
| 129 |
-
"2.3 A2 Unit 3": [24, 30],
|
| 130 |
-
"2.4 A2 Unit 4": [31, 35]
|
| 131 |
-
}}
|
| 132 |
-
or
|
| 133 |
-
2.1 AS units 6
|
| 134 |
-
2.2 AS units 23
|
| 135 |
-
In that scenario, your output might look like:
|
| 136 |
-
{{
|
| 137 |
-
"2.1 AS Unit 1": [6, 2],
|
| 138 |
-
"2.2 AS Unit 2": [23, 43]
|
| 139 |
-
}}
|
| 140 |
-
|
| 141 |
-
4. Another example might list subtopics:
|
| 142 |
-
3.1 Overarching themes 11
|
| 143 |
-
3.2 A: Proof 12
|
| 144 |
-
3.3 B: Algebra and functions 13
|
| 145 |
-
3.4 C: Coordinate geometry in the ( x , y ) plane 14
|
| 146 |
-
3.5 D: Sequences and series 15
|
| 147 |
-
3.6 E: Trigonometry 16
|
| 148 |
-
3.7 F: Exponentials and logarithms 17
|
| 149 |
-
3.8 G: Differentiation 18
|
| 150 |
-
3.9 H: Integration 19
|
| 151 |
-
3.10 I: Numerical methods 20
|
| 152 |
-
3.11 J: Vectors 20
|
| 153 |
-
3.12 K: Statistical sampling 21
|
| 154 |
-
3.13 L: Data presentation and interpretation 21
|
| 155 |
-
3.14 M: Probability 22
|
| 156 |
-
3.15 N: Statistical distributions 23
|
| 157 |
-
3.16 O: Statistical hypothesis testing 23
|
| 158 |
-
3.17 P: Quantities and units in mechanics 24
|
| 159 |
-
3.18 Q: Kinematics 24
|
| 160 |
-
3.19 R: Forces and Newton’s laws 24
|
| 161 |
-
3.20 S: Moments 25
|
| 162 |
-
3.21 Use of data in statistics 26
|
| 163 |
-
|
| 164 |
-
Here the correct output might look like:
|
| 165 |
-
{{
|
| 166 |
-
"A: Proof": [12, 12],
|
| 167 |
-
"B: Algebra and functions": [13, 13],
|
| 168 |
-
...
|
| 169 |
-
}}
|
| 170 |
-
Now, extract topics from this text:
|
| 171 |
-
{first_pages_text}
|
| 172 |
-
"""
|
| 173 |
-
global _GEMINI_CLIENT
|
| 174 |
-
if '_GEMINI_CLIENT' not in globals() or _GEMINI_CLIENT is None:
|
| 175 |
-
_GEMINI_CLIENT = genai.Client(api_key=self.api_key)
|
| 176 |
-
client = _GEMINI_CLIENT
|
| 177 |
-
try:
|
| 178 |
-
response = client.models.generate_content(
|
| 179 |
-
model="gemini-2.0-flash",
|
| 180 |
-
contents=[prompt],
|
| 181 |
-
config=types.GenerateContentConfig(temperature=0.0)
|
| 182 |
-
)
|
| 183 |
-
if not response or not response.text:
|
| 184 |
-
logger.warning("No text from LLM => returning empty subtopics.")
|
| 185 |
-
return {}
|
| 186 |
-
raw_json = response.text.strip()
|
| 187 |
-
cleaned = raw_json.replace("```json", "").replace("```", "")
|
| 188 |
-
try:
|
| 189 |
-
data = json.loads(cleaned)
|
| 190 |
-
except Exception as json_err:
|
| 191 |
-
logger.error(f"JSON parsing error: {json_err}")
|
| 192 |
-
return {}
|
| 193 |
-
final_dict = {}
|
| 194 |
-
found_sub_dict = None
|
| 195 |
-
for k, v in data.items():
|
| 196 |
-
if isinstance(v, dict):
|
| 197 |
-
found_sub_dict = v
|
| 198 |
-
break
|
| 199 |
-
if found_sub_dict is not None:
|
| 200 |
-
for subk, rng in found_sub_dict.items():
|
| 201 |
-
if isinstance(rng, list) and len(rng) == 2:
|
| 202 |
-
final_dict[subk] = rng
|
| 203 |
-
else:
|
| 204 |
-
for subk, rng in data.items():
|
| 205 |
-
if isinstance(rng, list) and len(rng) == 2:
|
| 206 |
-
final_dict[subk] = rng
|
| 207 |
-
return final_dict
|
| 208 |
-
except Exception as e:
|
| 209 |
-
logger.error(f"Gemini subtopic extraction error: {e}")
|
| 210 |
-
return {}
|
| 211 |
-
|
| 212 |
-
class TopicRangeExtractor:
|
| 213 |
-
def __init__(self, gemini_api_key: str):
|
| 214 |
-
self.gemini_api_key = gemini_api_key
|
| 215 |
-
self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
|
| 216 |
-
|
| 217 |
-
def process(self, pdf_path: str) -> dict:
|
| 218 |
-
logger.info(f"Processing PDF: {pdf_path}")
|
| 219 |
-
subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
|
| 220 |
-
logger.info(f"Gemini returned subtopics: {subtopics}")
|
| 221 |
-
|
| 222 |
-
if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
|
| 223 |
-
response = requests.get(pdf_path)
|
| 224 |
-
if response.status_code != 200:
|
| 225 |
-
logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
|
| 226 |
-
raise Exception(f"Failed to download PDF: {pdf_path}")
|
| 227 |
-
pdf_bytes = response.content
|
| 228 |
-
logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
|
| 229 |
-
else:
|
| 230 |
-
with open(pdf_path, "rb") as f:
|
| 231 |
-
pdf_bytes = f.read()
|
| 232 |
-
logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
|
| 233 |
-
|
| 234 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 235 |
-
total_pages = doc.page_count
|
| 236 |
-
doc.close()
|
| 237 |
-
|
| 238 |
-
if not subtopics:
|
| 239 |
-
return {"page_range": list(range(total_pages))}
|
| 240 |
-
|
| 241 |
-
offset_candidates = []
|
| 242 |
-
subtopics_corrected = {}
|
| 243 |
-
for subname, rng in subtopics.items():
|
| 244 |
-
if not (isinstance(rng, list) and len(rng) == 2):
|
| 245 |
-
continue
|
| 246 |
-
start_p, end_p = rng
|
| 247 |
-
occs = find_all_occurrences(pdf_bytes, subname)
|
| 248 |
-
for p in occs:
|
| 249 |
-
candidate = p - (start_p - 1)
|
| 250 |
-
if candidate > 0:
|
| 251 |
-
offset_candidates.append(candidate)
|
| 252 |
-
|
| 253 |
-
subtopics_corrected[subname] = rng
|
| 254 |
-
|
| 255 |
-
if offset_candidates:
|
| 256 |
-
try:
|
| 257 |
-
global_offset = mode(offset_candidates)
|
| 258 |
-
except Exception:
|
| 259 |
-
global_offset = int(median(offset_candidates))
|
| 260 |
-
else:
|
| 261 |
-
global_offset = 0
|
| 262 |
-
logger.info(f"Computed global offset: {global_offset}")
|
| 263 |
-
|
| 264 |
-
adjusted_subtopics = []
|
| 265 |
-
for subname, rng in subtopics_corrected.items():
|
| 266 |
-
start_p, end_p = rng
|
| 267 |
-
s0 = (start_p) + global_offset
|
| 268 |
-
e0 = (end_p - 1) + global_offset
|
| 269 |
-
adjusted_subtopics.append((subname, (s0, e0)))
|
| 270 |
-
|
| 271 |
-
sorted_subtopics = sorted(adjusted_subtopics, key=lambda x: x[1][0])
|
| 272 |
-
final_subtopics = []
|
| 273 |
-
for i in range(len(sorted_subtopics)):
|
| 274 |
-
subname, (s0, e0) = sorted_subtopics[i]
|
| 275 |
-
if i < len(sorted_subtopics) - 1:
|
| 276 |
-
next_s0 = sorted_subtopics[i + 1][1][0]
|
| 277 |
-
new_e0 = min(e0, next_s0 - 1)
|
| 278 |
-
else:
|
| 279 |
-
new_e0 = min(e0, total_pages - 1)
|
| 280 |
-
final_subtopics.append((subname, (s0, new_e0)))
|
| 281 |
-
|
| 282 |
-
real_pages_set = set()
|
| 283 |
-
for subname, (s0, e0) in final_subtopics:
|
| 284 |
-
for pp in range(s0, e0 + 1):
|
| 285 |
-
if 0 <= pp < total_pages:
|
| 286 |
-
real_pages_set.add(pp)
|
| 287 |
-
|
| 288 |
-
page_range = sorted(real_pages_set)
|
| 289 |
-
logger.info(f"Final page range: {page_range}")
|
| 290 |
-
return {"page_range": page_range}
|
| 291 |
-
|
| 292 |
-
if __name__ == "__main__":
|
| 293 |
-
input_pdf = "/home/user/app/input_output/pearson-A_Level_Economics.pdf"
|
| 294 |
-
gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 295 |
-
try:
|
| 296 |
-
extractor = TopicRangeExtractor(gemini_api_key=gemini_key)
|
| 297 |
-
result = extractor.process(input_pdf)
|
| 298 |
-
# print(json.dumps(result, indent=2))
|
| 299 |
-
except Exception as e:
|
| 300 |
-
logger.error(f"Processing failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
table_row_extraction.py
DELETED
|
@@ -1,441 +0,0 @@
|
|
| 1 |
-
import cv2
|
| 2 |
-
import numpy as np
|
| 3 |
-
import math
|
| 4 |
-
import logging
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from typing import List, Tuple
|
| 7 |
-
|
| 8 |
-
logging.basicConfig(level=logging.INFO)
|
| 9 |
-
logger = logging.getLogger(__name__)
|
| 10 |
-
|
| 11 |
-
# if you are working with 3-column tables, change `merge_two_col_rows` and `enable_subtopic_merge` to False
|
| 12 |
-
# otherwise set them to True if you are working with 2-column tables (currently hardcoded, just test)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def color_distance(c1: Tuple[float, float, float],
|
| 16 |
-
c2: Tuple[float, float, float]) -> float:
|
| 17 |
-
"""
|
| 18 |
-
Euclidean distance between two BGR colors c1 and c2.
|
| 19 |
-
"""
|
| 20 |
-
return math.sqrt((c1[0] - c2[0])**2 + (c1[1] - c2[1])**2 + (c1[2] - c2[2])**2)
|
| 21 |
-
|
| 22 |
-
def average_bgr(cell_img: np.ndarray) -> Tuple[float, float, float]:
|
| 23 |
-
"""
|
| 24 |
-
Return the average BGR color of the entire cell_img.
|
| 25 |
-
"""
|
| 26 |
-
b_mean = np.mean(cell_img[:, :, 0])
|
| 27 |
-
g_mean = np.mean(cell_img[:, :, 1])
|
| 28 |
-
r_mean = np.mean(cell_img[:, :, 2])
|
| 29 |
-
return (b_mean, g_mean, r_mean)
|
| 30 |
-
|
| 31 |
-
class TableExtractor:
|
| 32 |
-
def __init__(
|
| 33 |
-
self,
|
| 34 |
-
# --- Preprocessing ---
|
| 35 |
-
denoise_h: int = 10,
|
| 36 |
-
clahe_clip: float = 3.0,
|
| 37 |
-
clahe_grid: int = 8,
|
| 38 |
-
sharpen_kernel: np.ndarray = np.array([[-1, -1, -1],
|
| 39 |
-
[-1, 9, -1],
|
| 40 |
-
[-1, -1, -1]]),
|
| 41 |
-
thresh_block_size: int = 21,
|
| 42 |
-
thresh_C: int = 7,
|
| 43 |
-
|
| 44 |
-
# --- Row detection ---
|
| 45 |
-
horizontal_scale: int = 20,
|
| 46 |
-
row_morph_iterations: int = 1,
|
| 47 |
-
min_row_height: int = 15,
|
| 48 |
-
min_row_density: float = 0.01,
|
| 49 |
-
|
| 50 |
-
# Additional row detection parameters
|
| 51 |
-
faint_line_threshold_factor: float = 0.1,
|
| 52 |
-
top_line_grouping_px: int = 8,
|
| 53 |
-
some_minimum_text_pixels: int = 50,
|
| 54 |
-
|
| 55 |
-
# --- Column detection ---
|
| 56 |
-
vertical_scale: int = 20,
|
| 57 |
-
col_morph_iterations: int = 2,
|
| 58 |
-
min_col_height_ratio: float = 0.5,
|
| 59 |
-
min_col_density: float = 0.01,
|
| 60 |
-
|
| 61 |
-
# --- Bbox extraction ---
|
| 62 |
-
padding: int = 0,
|
| 63 |
-
skip_header: bool = True,
|
| 64 |
-
|
| 65 |
-
# --- Two-column & subtopic merges ---
|
| 66 |
-
merge_two_col_rows: bool = True,
|
| 67 |
-
enable_subtopic_merge: bool = True,
|
| 68 |
-
subtopic_threshold: float = 0.2,
|
| 69 |
-
|
| 70 |
-
# --- Color-based artifact filter ---
|
| 71 |
-
artifact_color_a6: Tuple[int, int, int] = (166, 166, 166),
|
| 72 |
-
artifact_color_a7: Tuple[int, int, int] = (180, 180, 180),
|
| 73 |
-
artifact_color_a8: Tuple[int, int, int] = (80, 48, 0),
|
| 74 |
-
artifact_color_a9: Tuple[int, int, int] = (223, 153, 180),
|
| 75 |
-
artifact_color_a10: Tuple[int, int, int] = (0, 0, 0),
|
| 76 |
-
color_tolerance: float = 30.0
|
| 77 |
-
):
|
| 78 |
-
# Preprocessing
|
| 79 |
-
self.denoise_h = denoise_h
|
| 80 |
-
self.clahe_clip = clahe_clip
|
| 81 |
-
self.clahe_grid = clahe_grid
|
| 82 |
-
self.sharpen_kernel = sharpen_kernel
|
| 83 |
-
self.thresh_block_size = thresh_block_size
|
| 84 |
-
self.thresh_C = thresh_C
|
| 85 |
-
|
| 86 |
-
# Row detection
|
| 87 |
-
self.horizontal_scale = horizontal_scale
|
| 88 |
-
self.row_morph_iterations = row_morph_iterations
|
| 89 |
-
self.min_row_height = min_row_height
|
| 90 |
-
self.min_row_density = min_row_density
|
| 91 |
-
|
| 92 |
-
# Additional row detection
|
| 93 |
-
self.faint_line_threshold_factor = faint_line_threshold_factor
|
| 94 |
-
self.top_line_grouping_px = top_line_grouping_px
|
| 95 |
-
self.some_minimum_text_pixels = some_minimum_text_pixels
|
| 96 |
-
|
| 97 |
-
# Column detection
|
| 98 |
-
self.vertical_scale = vertical_scale
|
| 99 |
-
self.col_morph_iterations = col_morph_iterations
|
| 100 |
-
self.min_col_height_ratio = min_col_height_ratio
|
| 101 |
-
self.min_col_density = min_col_density
|
| 102 |
-
|
| 103 |
-
# Bbox extraction
|
| 104 |
-
self.padding = padding
|
| 105 |
-
self.skip_header = skip_header
|
| 106 |
-
|
| 107 |
-
# Two-column & subtopic merges
|
| 108 |
-
self.merge_two_col_rows = merge_two_col_rows
|
| 109 |
-
self.enable_subtopic_merge = enable_subtopic_merge
|
| 110 |
-
self.subtopic_threshold = subtopic_threshold
|
| 111 |
-
|
| 112 |
-
# Color-based artifact filter
|
| 113 |
-
self.artifact_color_a6 = artifact_color_a6
|
| 114 |
-
self.artifact_color_a7 = artifact_color_a7
|
| 115 |
-
self.artifact_color_a8 = artifact_color_a8
|
| 116 |
-
self.artifact_color_a9 = artifact_color_a9
|
| 117 |
-
self.artifact_color_a10 = artifact_color_a10
|
| 118 |
-
self.color_tolerance = color_tolerance
|
| 119 |
-
|
| 120 |
-
def preprocess(self, img: np.ndarray) -> np.ndarray:
|
| 121 |
-
"""
|
| 122 |
-
Grayscale, denoise, CLAHE, sharpen, then adaptive threshold (binary_inv).
|
| 123 |
-
"""
|
| 124 |
-
if img.ndim == 3:
|
| 125 |
-
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
| 126 |
-
else:
|
| 127 |
-
gray = img.copy()
|
| 128 |
-
|
| 129 |
-
denoised = cv2.fastNlMeansDenoising(gray, h=self.denoise_h)
|
| 130 |
-
clahe = cv2.createCLAHE(clipLimit=self.clahe_clip,
|
| 131 |
-
tileGridSize=(self.clahe_grid, self.clahe_grid))
|
| 132 |
-
enhanced = clahe.apply(denoised)
|
| 133 |
-
sharpened = cv2.filter2D(enhanced, -1, self.sharpen_kernel)
|
| 134 |
-
|
| 135 |
-
binarized = cv2.adaptiveThreshold(
|
| 136 |
-
sharpened, 255,
|
| 137 |
-
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 138 |
-
cv2.THRESH_BINARY_INV,
|
| 139 |
-
self.thresh_block_size,
|
| 140 |
-
self.thresh_C
|
| 141 |
-
)
|
| 142 |
-
return binarized
|
| 143 |
-
|
| 144 |
-
def detect_full_rows(self, bin_img: np.ndarray) -> List[Tuple[int, int]]:
|
| 145 |
-
h_kernel_size = max(1, bin_img.shape[1] // self.horizontal_scale)
|
| 146 |
-
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_kernel_size, 1))
|
| 147 |
-
horizontal_lines = cv2.morphologyEx(
|
| 148 |
-
bin_img, cv2.MORPH_OPEN, horizontal_kernel,
|
| 149 |
-
iterations=self.row_morph_iterations
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
row_projection = np.sum(horizontal_lines, axis=1)
|
| 153 |
-
max_val = np.max(row_projection) if len(row_projection) else 0
|
| 154 |
-
|
| 155 |
-
if max_val < 1e-5:
|
| 156 |
-
return [(0, bin_img.shape[0])]
|
| 157 |
-
|
| 158 |
-
threshold_val = self.faint_line_threshold_factor * max_val
|
| 159 |
-
line_indices = np.where(row_projection > threshold_val)[0]
|
| 160 |
-
if len(line_indices) < 2:
|
| 161 |
-
return [(0, bin_img.shape[0])]
|
| 162 |
-
|
| 163 |
-
lines = []
|
| 164 |
-
group = [line_indices[0]]
|
| 165 |
-
for i in range(1, len(line_indices)):
|
| 166 |
-
if (line_indices[i] - line_indices[i - 1]) <= self.top_line_grouping_px:
|
| 167 |
-
group.append(line_indices[i])
|
| 168 |
-
else:
|
| 169 |
-
lines.append(int(np.mean(group)))
|
| 170 |
-
group = [line_indices[i]]
|
| 171 |
-
if group:
|
| 172 |
-
lines.append(int(np.mean(group)))
|
| 173 |
-
|
| 174 |
-
potential_bounds = []
|
| 175 |
-
for i in range(len(lines) - 1):
|
| 176 |
-
y1 = lines[i]
|
| 177 |
-
y2 = lines[i + 1]
|
| 178 |
-
if (y2 - y1) > 0:
|
| 179 |
-
potential_bounds.append((y1, y2))
|
| 180 |
-
|
| 181 |
-
if potential_bounds:
|
| 182 |
-
if potential_bounds[0][0] > 0:
|
| 183 |
-
potential_bounds.insert(0, (0, potential_bounds[0][0]))
|
| 184 |
-
if potential_bounds[-1][1] < bin_img.shape[0]:
|
| 185 |
-
potential_bounds.append((potential_bounds[-1][1], bin_img.shape[0]))
|
| 186 |
-
else:
|
| 187 |
-
potential_bounds = [(0, bin_img.shape[0])]
|
| 188 |
-
|
| 189 |
-
final_rows = []
|
| 190 |
-
for (y1, y2) in potential_bounds:
|
| 191 |
-
height = (y2 - y1)
|
| 192 |
-
region = bin_img[y1:y2, :]
|
| 193 |
-
white_count = np.sum(region == 255)
|
| 194 |
-
|
| 195 |
-
if height < self.min_row_height:
|
| 196 |
-
if white_count >= self.some_minimum_text_pixels:
|
| 197 |
-
final_rows.append((y1, y2))
|
| 198 |
-
else:
|
| 199 |
-
final_rows.append((y1, y2))
|
| 200 |
-
|
| 201 |
-
final_rows = sorted(final_rows, key=lambda x: x[0])
|
| 202 |
-
return final_rows if final_rows else [(0, bin_img.shape[0])]
|
| 203 |
-
|
| 204 |
-
def detect_columns_in_row(self,
|
| 205 |
-
row_img: np.ndarray,
|
| 206 |
-
y1: int,
|
| 207 |
-
y2: int) -> List[Tuple[int, int, int, int]]:
|
| 208 |
-
row_height = (y2 - y1)
|
| 209 |
-
row_width = row_img.shape[1]
|
| 210 |
-
|
| 211 |
-
v_kernel_size = max(1, row_height // self.vertical_scale)
|
| 212 |
-
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_kernel_size))
|
| 213 |
-
|
| 214 |
-
vertical_lines = cv2.morphologyEx(
|
| 215 |
-
row_img, cv2.MORPH_OPEN, vertical_kernel,
|
| 216 |
-
iterations=self.col_morph_iterations
|
| 217 |
-
)
|
| 218 |
-
vertical_lines = cv2.dilate(vertical_lines,
|
| 219 |
-
np.ones((3, 3), np.uint8),
|
| 220 |
-
iterations=1)
|
| 221 |
-
|
| 222 |
-
# Find contours => x positions
|
| 223 |
-
contours, _ = cv2.findContours(vertical_lines,
|
| 224 |
-
cv2.RETR_EXTERNAL,
|
| 225 |
-
cv2.CHAIN_APPROX_SIMPLE)
|
| 226 |
-
x_positions = []
|
| 227 |
-
for c in contours:
|
| 228 |
-
x, _, w, h = cv2.boundingRect(c)
|
| 229 |
-
# Must be at least half the row height to be a real divider
|
| 230 |
-
if h >= self.min_col_height_ratio * row_height:
|
| 231 |
-
x_positions.append(x)
|
| 232 |
-
|
| 233 |
-
x_positions = sorted(set(x_positions))
|
| 234 |
-
# Keep at most 2 vertical lines
|
| 235 |
-
if len(x_positions) > 2:
|
| 236 |
-
x_positions = x_positions[:2]
|
| 237 |
-
|
| 238 |
-
# Build bounding boxes
|
| 239 |
-
if len(x_positions) == 0:
|
| 240 |
-
# 0 lines => single bounding box
|
| 241 |
-
boxes = [(0, y1, row_width, row_height)]
|
| 242 |
-
|
| 243 |
-
elif len(x_positions) == 1:
|
| 244 |
-
# 1 line => 2 bounding boxes by default
|
| 245 |
-
x1 = x_positions[0]
|
| 246 |
-
if self.merge_two_col_rows:
|
| 247 |
-
# Merge => single bounding box
|
| 248 |
-
boxes = [(0, y1, row_width, row_height)]
|
| 249 |
-
else:
|
| 250 |
-
boxes = [
|
| 251 |
-
(0, y1, x1, row_height),
|
| 252 |
-
(x1, y1, row_width - x1, row_height)
|
| 253 |
-
]
|
| 254 |
-
else:
|
| 255 |
-
# 2 lines => normally 3 bounding boxes
|
| 256 |
-
x1, x2 = sorted(x_positions)
|
| 257 |
-
if self.enable_subtopic_merge:
|
| 258 |
-
# If left bounding box is very narrow => treat as subtopic => 2 boxes
|
| 259 |
-
if x1 < (self.subtopic_threshold * row_width):
|
| 260 |
-
boxes = [
|
| 261 |
-
(0, y1, x1, row_height),
|
| 262 |
-
(x1, y1, row_width - x1, row_height)
|
| 263 |
-
]
|
| 264 |
-
else:
|
| 265 |
-
boxes = [
|
| 266 |
-
(0, y1, x1, row_height),
|
| 267 |
-
(x1, y1, x2 - x1, row_height),
|
| 268 |
-
(x2, y1, row_width - x2, row_height)
|
| 269 |
-
]
|
| 270 |
-
else:
|
| 271 |
-
boxes = [
|
| 272 |
-
(0, y1, x1, row_height),
|
| 273 |
-
(x1, y1, x2 - x1, row_height),
|
| 274 |
-
(x2, y1, row_width - x2, row_height)
|
| 275 |
-
]
|
| 276 |
-
|
| 277 |
-
# Filter out columns with insufficient density
|
| 278 |
-
filtered = []
|
| 279 |
-
for (x, y, w, h) in boxes:
|
| 280 |
-
if w <= 0:
|
| 281 |
-
continue
|
| 282 |
-
subregion = row_img[:, x:x+w]
|
| 283 |
-
white_pixels = np.sum(subregion == 255)
|
| 284 |
-
total_pixels = subregion.size
|
| 285 |
-
if total_pixels == 0:
|
| 286 |
-
continue
|
| 287 |
-
density = white_pixels / float(total_pixels)
|
| 288 |
-
if density >= self.min_col_density:
|
| 289 |
-
filtered.append((x, y, w, h))
|
| 290 |
-
|
| 291 |
-
return filtered
|
| 292 |
-
|
| 293 |
-
def process_image(self, image_path: str) -> List[List[Tuple[int, int, int, int]]]:
|
| 294 |
-
"""
|
| 295 |
-
1) Preprocess => bin_img
|
| 296 |
-
2) Detect row segments (with faint-line logic)
|
| 297 |
-
3) Filter out rows by density
|
| 298 |
-
4) Optionally skip the first row (header)
|
| 299 |
-
5) For each row => detect columns => bounding boxes
|
| 300 |
-
"""
|
| 301 |
-
img = cv2.imread(image_path)
|
| 302 |
-
if img is None:
|
| 303 |
-
raise ValueError(f"Could not read image: {image_path}")
|
| 304 |
-
|
| 305 |
-
bin_img = self.preprocess(img)
|
| 306 |
-
row_segments = self.detect_full_rows(bin_img)
|
| 307 |
-
|
| 308 |
-
# Filter out rows with insufficient density
|
| 309 |
-
valid_rows = []
|
| 310 |
-
for (y1, y2) in row_segments:
|
| 311 |
-
row_region = bin_img[y1:y2, :]
|
| 312 |
-
area = row_region.size
|
| 313 |
-
if area == 0:
|
| 314 |
-
continue
|
| 315 |
-
white_pixels = np.sum(row_region == 255)
|
| 316 |
-
density = white_pixels / float(area)
|
| 317 |
-
if density >= self.min_row_density:
|
| 318 |
-
valid_rows.append((y1, y2))
|
| 319 |
-
|
| 320 |
-
# skip header row
|
| 321 |
-
if self.skip_header and len(valid_rows) > 1:
|
| 322 |
-
valid_rows = valid_rows[1:]
|
| 323 |
-
|
| 324 |
-
# Detect columns in each valid row
|
| 325 |
-
all_rows_boxes = []
|
| 326 |
-
for (y1, y2) in valid_rows:
|
| 327 |
-
row_img = bin_img[y1:y2, :]
|
| 328 |
-
col_boxes = self.detect_columns_in_row(row_img, y1, y2)
|
| 329 |
-
if col_boxes:
|
| 330 |
-
all_rows_boxes.append(col_boxes)
|
| 331 |
-
|
| 332 |
-
return all_rows_boxes
|
| 333 |
-
|
| 334 |
-
def extract_box_image(self,
|
| 335 |
-
original: np.ndarray,
|
| 336 |
-
box: Tuple[int, int, int, int]) -> np.ndarray:
|
| 337 |
-
"""
|
| 338 |
-
Crop bounding box from original with optional padding.
|
| 339 |
-
"""
|
| 340 |
-
x, y, w, h = box
|
| 341 |
-
Y1 = max(0, y - self.padding)
|
| 342 |
-
Y2 = min(original.shape[0], y + h + self.padding)
|
| 343 |
-
X1 = max(0, x - self.padding)
|
| 344 |
-
X2 = min(original.shape[1], x + w + self.padding)
|
| 345 |
-
return original[Y1:Y2, X1:X2]
|
| 346 |
-
|
| 347 |
-
def is_artifact_by_color(self, cell_img: np.ndarray) -> bool:
|
| 348 |
-
"""
|
| 349 |
-
Revert to the *exact* color-based artifact logic from the first script:
|
| 350 |
-
1) If the average color is near #a6a6a6 or #a7a7a7 (within color_tolerance),
|
| 351 |
-
skip it. Otherwise, keep it.
|
| 352 |
-
"""
|
| 353 |
-
if cell_img.size == 0:
|
| 354 |
-
return True
|
| 355 |
-
|
| 356 |
-
avg_col = average_bgr(cell_img)
|
| 357 |
-
dist_a6 = color_distance(avg_col, self.artifact_color_a6)
|
| 358 |
-
if dist_a6 < self.color_tolerance:
|
| 359 |
-
return True
|
| 360 |
-
|
| 361 |
-
dist_a7 = color_distance(avg_col, self.artifact_color_a7)
|
| 362 |
-
if dist_a7 < self.color_tolerance:
|
| 363 |
-
return True
|
| 364 |
-
|
| 365 |
-
dist_a8 = color_distance(avg_col, self.artifact_color_a8)
|
| 366 |
-
if dist_a8 < self.color_tolerance:
|
| 367 |
-
return True
|
| 368 |
-
|
| 369 |
-
dist_a9 = color_distance(avg_col, self.artifact_color_a9)
|
| 370 |
-
if dist_a9 < self.color_tolerance:
|
| 371 |
-
return True
|
| 372 |
-
|
| 373 |
-
dist_a10 = color_distance(avg_col, self.artifact_color_a10)
|
| 374 |
-
if dist_a10 < self.color_tolerance:
|
| 375 |
-
return True
|
| 376 |
-
|
| 377 |
-
return False
|
| 378 |
-
|
| 379 |
-
def save_extracted_cells(
|
| 380 |
-
self,
|
| 381 |
-
image_path: str,
|
| 382 |
-
row_boxes: List[List[Tuple[int, int, int, int]]],
|
| 383 |
-
output_dir: str
|
| 384 |
-
):
|
| 385 |
-
"""
|
| 386 |
-
Save each cell from the original image, skipping if it's near #a6a6a6 or #a7a7a7.
|
| 387 |
-
"""
|
| 388 |
-
out_path = Path(output_dir)
|
| 389 |
-
out_path.mkdir(exist_ok=True, parents=True)
|
| 390 |
-
|
| 391 |
-
original = cv2.imread(image_path)
|
| 392 |
-
if original is None:
|
| 393 |
-
raise ValueError(f"Could not read original image: {image_path}")
|
| 394 |
-
|
| 395 |
-
for i, row in enumerate(row_boxes):
|
| 396 |
-
row_dir = out_path / f"row_{i}"
|
| 397 |
-
row_dir.mkdir(exist_ok=True)
|
| 398 |
-
for j, box in enumerate(row):
|
| 399 |
-
cell_img = self.extract_box_image(original, box)
|
| 400 |
-
|
| 401 |
-
# Check color-based artifact
|
| 402 |
-
if self.is_artifact_by_color(cell_img):
|
| 403 |
-
logger.info(f"Skipping artifact cell at row={i}, col={j} (color near #a6a6a6/#a7a7a7).")
|
| 404 |
-
continue
|
| 405 |
-
|
| 406 |
-
out_file = row_dir / f"col_{j}.png"
|
| 407 |
-
cv2.imwrite(str(out_file), cell_img)
|
| 408 |
-
logger.info(f"Saved cell row={i}, col={j} -> {out_file}")
|
| 409 |
-
|
| 410 |
-
class TableExtractorApp:
|
| 411 |
-
def __init__(self, extractor: TableExtractor):
|
| 412 |
-
self.extractor = extractor
|
| 413 |
-
|
| 414 |
-
def run(self, input_image: str, output_folder: str):
|
| 415 |
-
row_boxes = self.extractor.process_image(input_image)
|
| 416 |
-
logger.info(f"Detected {len(row_boxes)} row(s).")
|
| 417 |
-
self.extractor.save_extracted_cells(input_image, row_boxes, output_folder)
|
| 418 |
-
logger.info("Done. Check the output folder for results.")
|
| 419 |
-
|
| 420 |
-
if __name__ == "__main__":
|
| 421 |
-
input_image = "images/test/img_9.png"
|
| 422 |
-
output_folder = "combined_outputs"
|
| 423 |
-
|
| 424 |
-
extractor = TableExtractor(
|
| 425 |
-
row_morph_iterations=1,
|
| 426 |
-
min_row_height=15,
|
| 427 |
-
skip_header=False,
|
| 428 |
-
|
| 429 |
-
merge_two_col_rows=True,
|
| 430 |
-
enable_subtopic_merge=True,
|
| 431 |
-
subtopic_threshold=0.2,
|
| 432 |
-
|
| 433 |
-
faint_line_threshold_factor=0.4,
|
| 434 |
-
top_line_grouping_px=12,
|
| 435 |
-
some_minimum_text_pixels=50,
|
| 436 |
-
|
| 437 |
-
color_tolerance=30.0
|
| 438 |
-
)
|
| 439 |
-
|
| 440 |
-
app = TableExtractorApp(extractor)
|
| 441 |
-
app.run(input_image, output_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
topic_extr.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
| 3 |
-
import sys
|
| 4 |
import json
|
| 5 |
import logging
|
| 6 |
import gc
|
|
@@ -8,58 +7,22 @@ import fitz
|
|
| 8 |
import requests
|
| 9 |
import torch
|
| 10 |
import boto3
|
| 11 |
-
import
|
|
|
|
| 12 |
|
| 13 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 14 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 15 |
|
| 16 |
-
logging.basicConfig(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
-
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: list) -> bytes:
|
| 20 |
-
if not page_indices:
|
| 21 |
-
raise ValueError("No page indices provided for subset creation.")
|
| 22 |
-
doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
|
| 23 |
-
new_doc = fitz.open()
|
| 24 |
-
for p in sorted(set(page_indices)):
|
| 25 |
-
if 0 <= p < doc.page_count:
|
| 26 |
-
new_doc.insert_pdf(doc, from_page=p, to_page=p)
|
| 27 |
-
else:
|
| 28 |
-
logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
|
| 29 |
-
raise ValueError(f"Page index {p} out of range.")
|
| 30 |
-
subset_bytes = new_doc.tobytes()
|
| 31 |
-
new_doc.close()
|
| 32 |
-
doc.close()
|
| 33 |
-
return subset_bytes
|
| 34 |
-
|
| 35 |
-
def parse_page_range(page_field) -> list:
|
| 36 |
-
"""
|
| 37 |
-
Parse the 'page' field from the JSON input.
|
| 38 |
-
It can be either:
|
| 39 |
-
• a list of integers:
|
| 40 |
-
- If the list contains exactly two integers, treat them as a range [start, end] (inclusive start, exclusive end).
|
| 41 |
-
- Otherwise, treat the list as a sequence of individual pages.
|
| 42 |
-
• a string:
|
| 43 |
-
- Either a comma-separated range "start, end" or a comma-separated list of pages.
|
| 44 |
-
The numbers are assumed to be 1-indexed and are converted to 0-indexed.
|
| 45 |
-
"""
|
| 46 |
-
if isinstance(page_field, list):
|
| 47 |
-
if len(page_field) == 2:
|
| 48 |
-
start, end = page_field
|
| 49 |
-
return list(range(start - 1, end))
|
| 50 |
-
else:
|
| 51 |
-
return [int(p) - 1 for p in page_field]
|
| 52 |
-
elif isinstance(page_field, str):
|
| 53 |
-
parts = [p.strip() for p in page_field.split(',')]
|
| 54 |
-
if len(parts) == 2:
|
| 55 |
-
start, end = int(parts[0]), int(parts[1])
|
| 56 |
-
return list(range(start - 1, end))
|
| 57 |
-
else:
|
| 58 |
-
return [int(p) - 1 for p in parts]
|
| 59 |
-
else:
|
| 60 |
-
logger.error("Invalid type for page field. Must be list or string.")
|
| 61 |
-
raise ValueError("Invalid page field type.")
|
| 62 |
-
|
| 63 |
class s3Writer:
|
| 64 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
| 65 |
self.bucket = bucket
|
|
@@ -72,7 +35,6 @@ class s3Writer:
|
|
| 72 |
|
| 73 |
def write(self, path: str, data: bytes) -> None:
|
| 74 |
try:
|
| 75 |
-
from io import BytesIO
|
| 76 |
file_obj = BytesIO(data)
|
| 77 |
self.client.upload_fileobj(file_obj, self.bucket, path)
|
| 78 |
logger.info(f"Uploaded to S3: {path}")
|
|
@@ -101,21 +63,42 @@ class S3ImageWriter:
|
|
| 101 |
md_content = md_content.replace(f"", f"")
|
| 102 |
return md_content
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
class TopicExtractionProcessor:
|
| 105 |
-
def __init__(self, gemini_api_key: str
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
def cleanup_gpu(self):
|
| 121 |
try:
|
|
@@ -123,105 +106,100 @@ class TopicExtractionProcessor:
|
|
| 123 |
torch.cuda.empty_cache()
|
| 124 |
logger.info("GPU memory cleaned up.")
|
| 125 |
except Exception as e:
|
| 126 |
-
logger.error(
|
| 127 |
-
|
| 128 |
-
def
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
def main():
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
"
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
"key": "sample_spec",
|
| 185 |
-
"url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf",
|
| 186 |
-
"type": "specification",
|
| 187 |
-
"page": [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
| 188 |
-
}
|
| 189 |
-
],
|
| 190 |
-
"topics": [
|
| 191 |
-
{
|
| 192 |
-
"title": "Sample Topic",
|
| 193 |
-
"id": 123
|
| 194 |
-
}
|
| 195 |
-
]
|
| 196 |
-
}
|
| 197 |
}
|
| 198 |
-
data = message.get("data", {})
|
| 199 |
-
input_files = data.get("input_files", [])
|
| 200 |
-
|
| 201 |
-
output_folder = "output"
|
| 202 |
-
|
| 203 |
-
gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 204 |
|
| 205 |
-
|
| 206 |
-
"ak": os.getenv("S3_ACCESS_KEY"),
|
| 207 |
-
"sk": os.getenv("S3_SECRET_KEY"),
|
| 208 |
-
"bucket": "quextro-resources",
|
| 209 |
-
"endpoint_url": os.getenv("S3_ENDPOINT")
|
| 210 |
-
}
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
try:
|
| 220 |
-
logger.info("Processing input file with key: %s", input_file.get("key", ""))
|
| 221 |
-
final_md = processor.process_input_file(input_file)
|
| 222 |
-
logger.info("Processing completed for key: %s", input_file.get("key", ""))
|
| 223 |
-
except Exception as e:
|
| 224 |
-
logger.error("Error processing input file: %s", e)
|
| 225 |
|
| 226 |
if __name__ == "__main__":
|
| 227 |
-
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
|
|
|
| 3 |
import json
|
| 4 |
import logging
|
| 5 |
import gc
|
|
|
|
| 7 |
import requests
|
| 8 |
import torch
|
| 9 |
import boto3
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
from typing import Dict, List, Any
|
| 12 |
|
| 13 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 14 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 15 |
|
| 16 |
+
logging.basicConfig(
|
| 17 |
+
level=logging.INFO,
|
| 18 |
+
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
|
| 19 |
+
handlers=[
|
| 20 |
+
logging.StreamHandler(),
|
| 21 |
+
logging.FileHandler('topic_processor.log')
|
| 22 |
+
]
|
| 23 |
+
)
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
class s3Writer:
|
| 27 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
| 28 |
self.bucket = bucket
|
|
|
|
| 35 |
|
| 36 |
def write(self, path: str, data: bytes) -> None:
|
| 37 |
try:
|
|
|
|
| 38 |
file_obj = BytesIO(data)
|
| 39 |
self.client.upload_fileobj(file_obj, self.bucket, path)
|
| 40 |
logger.info(f"Uploaded to S3: {path}")
|
|
|
|
| 63 |
md_content = md_content.replace(f"", f"")
|
| 64 |
return md_content
|
| 65 |
|
| 66 |
+
def delete_non_heading_text(md_content: str) -> str:
|
| 67 |
+
filtered_lines = []
|
| 68 |
+
for line in md_content.splitlines():
|
| 69 |
+
stripped = line.lstrip()
|
| 70 |
+
if stripped.startswith('#') or stripped.startswith(':
|
| 71 |
+
filtered_lines.append(line)
|
| 72 |
+
return "\n".join(filtered_lines)
|
| 73 |
+
|
| 74 |
class TopicExtractionProcessor:
|
| 75 |
+
def __init__(self, gemini_api_key: str = None):
|
| 76 |
+
try:
|
| 77 |
+
self.s3_writer = s3Writer(
|
| 78 |
+
ak=os.getenv("S3_ACCESS_KEY"),
|
| 79 |
+
sk=os.getenv("S3_SECRET_KEY"),
|
| 80 |
+
bucket="quextro-resources",
|
| 81 |
+
endpoint_url=os.getenv("S3_ENDPOINT")
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
config_path = "/home/user/magic-pdf.json"
|
| 85 |
+
if os.path.exists(config_path):
|
| 86 |
+
with open(config_path, "r") as f:
|
| 87 |
+
config = json.load(f)
|
| 88 |
+
self.layout_model = config.get("layout-config", {}).get("model", "doclayout_yolo")
|
| 89 |
+
self.formula_enable = config.get("formula-config", {}).get("enable", True)
|
| 90 |
+
else:
|
| 91 |
+
self.layout_model = "doclayout_yolo"
|
| 92 |
+
self.formula_enable = True
|
| 93 |
+
|
| 94 |
+
self.table_enable = False
|
| 95 |
+
self.language = "en"
|
| 96 |
+
self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 97 |
+
|
| 98 |
+
logger.info("TopicExtractionProcessor initialized successfully")
|
| 99 |
+
except Exception as e:
|
| 100 |
+
logger.error("Failed to initialize TopicExtractionProcessor: %s", str(e))
|
| 101 |
+
raise
|
| 102 |
|
| 103 |
def cleanup_gpu(self):
|
| 104 |
try:
|
|
|
|
| 106 |
torch.cuda.empty_cache()
|
| 107 |
logger.info("GPU memory cleaned up.")
|
| 108 |
except Exception as e:
|
| 109 |
+
logger.error("Error during GPU cleanup: %s", e)
|
| 110 |
+
|
| 111 |
+
def process(self, input_file: Dict[str, Any]) -> str:
|
| 112 |
+
try:
|
| 113 |
+
key = input_file.get("key", "")
|
| 114 |
+
url = input_file.get("url", "")
|
| 115 |
+
page_field = input_file.get("page")
|
| 116 |
+
|
| 117 |
+
if not url or not page_field:
|
| 118 |
+
raise ValueError("Missing required 'url' or 'page' in input file")
|
| 119 |
+
|
| 120 |
+
page_indices = self.parse_page_range(page_field)
|
| 121 |
+
logger.info("Processing %s with pages %s", key, page_indices)
|
| 122 |
+
|
| 123 |
+
if url.startswith(("http://", "https://")):
|
| 124 |
+
response = requests.get(url)
|
| 125 |
+
response.raise_for_status()
|
| 126 |
+
pdf_bytes = response.content
|
| 127 |
+
else:
|
| 128 |
+
with open(url, "rb") as f:
|
| 129 |
+
pdf_bytes = f.read()
|
| 130 |
+
|
| 131 |
+
subset_pdf = self.create_subset_pdf(pdf_bytes, page_indices)
|
| 132 |
+
|
| 133 |
+
dataset = PymuDocDataset(subset_pdf)
|
| 134 |
+
inference = doc_analyze(
|
| 135 |
+
dataset,
|
| 136 |
+
ocr=True,
|
| 137 |
+
lang=self.language,
|
| 138 |
+
layout_model=self.layout_model,
|
| 139 |
+
formula_enable=self.formula_enable,
|
| 140 |
+
table_enable=self.table_enable
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
base_path = f"/topic-extraction/{key}/"
|
| 144 |
+
writer = S3ImageWriter(self.s3_writer, "/topic-extraction/", self.gemini_api_key)
|
| 145 |
+
md_prefix = "/topic-extraction/"
|
| 146 |
+
pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
|
| 147 |
+
md_content = pipe_result.get_markdown(md_prefix)
|
| 148 |
+
post_processed = writer.post_process(md_prefix, md_content)
|
| 149 |
+
|
| 150 |
+
#remove non-heading text from the markdown output
|
| 151 |
+
final_markdown = delete_non_heading_text(post_processed)
|
| 152 |
+
|
| 153 |
+
return final_markdown
|
| 154 |
+
|
| 155 |
+
except Exception as e:
|
| 156 |
+
logger.error("Processing failed for %s: %s", key, str(e))
|
| 157 |
+
raise
|
| 158 |
+
finally:
|
| 159 |
+
self.cleanup_gpu()
|
| 160 |
+
|
| 161 |
+
def create_subset_pdf(self, pdf_bytes: bytes, page_indices: List[int]) -> bytes:
|
| 162 |
+
"""Create a PDF subset from specified pages"""
|
| 163 |
+
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 164 |
+
new_doc = fitz.open()
|
| 165 |
|
| 166 |
+
try:
|
| 167 |
+
for p in sorted(set(page_indices)):
|
| 168 |
+
if 0 <= p < doc.page_count:
|
| 169 |
+
new_doc.insert_pdf(doc, from_page=p, to_page=p)
|
| 170 |
+
else:
|
| 171 |
+
raise ValueError(f"Page index {p} out of range (0-{doc.page_count-1})")
|
| 172 |
+
return new_doc.tobytes()
|
| 173 |
+
finally:
|
| 174 |
+
new_doc.close()
|
| 175 |
+
doc.close()
|
| 176 |
+
|
| 177 |
+
def parse_page_range(self, page_field) -> List[int]:
|
| 178 |
+
"""Parse page range from input (1-indexed to 0-indexed)"""
|
| 179 |
+
if isinstance(page_field, list):
|
| 180 |
+
return [int(p) - 1 for p in page_field]
|
| 181 |
+
if isinstance(page_field, str):
|
| 182 |
+
parts = [p.strip() for p in page_field.split(',')]
|
| 183 |
+
return [int(p) - 1 for p in parts]
|
| 184 |
+
raise ValueError("Invalid page field type")
|
| 185 |
|
| 186 |
def main():
|
| 187 |
+
"""Local test execution without RabbitMQ"""
|
| 188 |
+
test_input = {
|
| 189 |
+
"key": "local_test",
|
| 190 |
+
"url": "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf", # Local PDF path
|
| 191 |
+
"page":[15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
processor = TopicExtractionProcessor()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
+
try:
|
| 197 |
+
logger.info("Starting test processing.")
|
| 198 |
+
result = processor.process(test_input)
|
| 199 |
+
logger.info("Processing completed successfully")
|
| 200 |
+
print("Markdown:\n", result)
|
| 201 |
+
except Exception as e:
|
| 202 |
+
logger.error("Test failed: %s", str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
if __name__ == "__main__":
|
| 205 |
+
main()
|
topic_extraction.py
DELETED
|
@@ -1,988 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
import gc
|
| 5 |
-
import json
|
| 6 |
-
import logging
|
| 7 |
-
import fitz
|
| 8 |
-
import boto3
|
| 9 |
-
import base64
|
| 10 |
-
import time
|
| 11 |
-
import asyncio
|
| 12 |
-
import tempfile
|
| 13 |
-
import requests
|
| 14 |
-
from io import BytesIO
|
| 15 |
-
from typing import List, Dict, Any
|
| 16 |
-
|
| 17 |
-
import torch
|
| 18 |
-
import cv2
|
| 19 |
-
import numpy as np
|
| 20 |
-
|
| 21 |
-
from google import genai
|
| 22 |
-
from google.genai import types
|
| 23 |
-
|
| 24 |
-
from magic_pdf.data.dataset import PymuDocDataset
|
| 25 |
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 26 |
-
from magic_pdf.data.data_reader_writer.base import DataWriter
|
| 27 |
-
from table_row_extraction import TableExtractor
|
| 28 |
-
|
| 29 |
-
logging.basicConfig(level=logging.INFO)
|
| 30 |
-
logger = logging.getLogger(__name__)
|
| 31 |
-
logger.setLevel(logging.INFO)
|
| 32 |
-
file_handler = logging.FileHandler("topic_extraction.log")
|
| 33 |
-
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
|
| 34 |
-
logger.addHandler(file_handler)
|
| 35 |
-
|
| 36 |
-
_GEMINI_CLIENT = None
|
| 37 |
-
|
| 38 |
-
#helper functions, also global
|
| 39 |
-
def unify_whitespace(text: str) -> str:
|
| 40 |
-
return re.sub(r"\s+", " ", text).strip()
|
| 41 |
-
|
| 42 |
-
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
|
| 43 |
-
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 44 |
-
st_norm = unify_whitespace(search_text)
|
| 45 |
-
found = []
|
| 46 |
-
for i in range(doc.page_count):
|
| 47 |
-
raw = doc[i].get_text("raw")
|
| 48 |
-
norm = unify_whitespace(raw)
|
| 49 |
-
if st_norm in norm:
|
| 50 |
-
found.append(i)
|
| 51 |
-
doc.close()
|
| 52 |
-
return sorted(found)
|
| 53 |
-
|
| 54 |
-
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
|
| 55 |
-
if not page_indices:
|
| 56 |
-
raise ValueError("No page indices provided for subset creation.")
|
| 57 |
-
doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
|
| 58 |
-
new_doc = fitz.open()
|
| 59 |
-
for p in sorted(set(page_indices)):
|
| 60 |
-
if 0 <= p < doc.page_count:
|
| 61 |
-
new_doc.insert_pdf(doc, from_page=p, to_page=p)
|
| 62 |
-
else:
|
| 63 |
-
logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
|
| 64 |
-
raise ValueError(f"Page index {p} out of range.")
|
| 65 |
-
subset_bytes = new_doc.tobytes()
|
| 66 |
-
new_doc.close()
|
| 67 |
-
doc.close()
|
| 68 |
-
return subset_bytes
|
| 69 |
-
|
| 70 |
-
def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
|
| 71 |
-
"""
|
| 72 |
-
Clean up a topic title:
|
| 73 |
-
- Remove any trailing "continued".
|
| 74 |
-
- If the title does not start with a number but children provide a consistent numeric prefix,
|
| 75 |
-
then prepend that prefix.
|
| 76 |
-
"""
|
| 77 |
-
title = raw_title.strip()
|
| 78 |
-
# Remove trailing "continued"
|
| 79 |
-
title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
|
| 80 |
-
|
| 81 |
-
# If title already starts with a number, use it as is.
|
| 82 |
-
if re.match(r"^\d+", title):
|
| 83 |
-
return title
|
| 84 |
-
|
| 85 |
-
# Otherwise, try to deduce a numeric prefix from the children.
|
| 86 |
-
prefixes = []
|
| 87 |
-
for child in children_subtopics:
|
| 88 |
-
child_title = child.get("title", "").strip()
|
| 89 |
-
m = re.match(r"^(\d+)\.", child_title)
|
| 90 |
-
if m:
|
| 91 |
-
prefixes.append(m.group(1))
|
| 92 |
-
if prefixes:
|
| 93 |
-
# If all numeric prefixes in children are the same, use that prefix.
|
| 94 |
-
if all(p == prefixes[0] for p in prefixes):
|
| 95 |
-
# If title is non-empty, prepend the number; otherwise, use a fallback.
|
| 96 |
-
if title:
|
| 97 |
-
title = f"{prefixes[0]} {title}"
|
| 98 |
-
else:
|
| 99 |
-
title = f"{prefixes[0]} Topic"
|
| 100 |
-
# Optionally, handle known broken titles explicitly.
|
| 101 |
-
if title.lower() in {"gonometry"}:
|
| 102 |
-
# For example, if children indicate "5.X", set to "5 Trigonometry"
|
| 103 |
-
if prefixes and prefixes[0] == "5":
|
| 104 |
-
title = "5 Trigonometry"
|
| 105 |
-
return title
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
def merge_topics(subtopic_list: list) -> list:
|
| 109 |
-
"""
|
| 110 |
-
Merge topics with an enhanced logic:
|
| 111 |
-
1. Clean up each topic's title using unify_topic_name.
|
| 112 |
-
2. Group topics by the parent's numeric prefix (if available). Topics without a numeric prefix use their title.
|
| 113 |
-
3. Reassign children: for each child whose title (e.g. "3.1") does not match its current parent's numeric prefix,
|
| 114 |
-
move it to the parent with the matching prefix if available.
|
| 115 |
-
4. Remove duplicate children by merging contents.
|
| 116 |
-
5. Sort parent topics and each parent's children by their numeric ordering.
|
| 117 |
-
"""
|
| 118 |
-
# First, merge topics by parent's numeric prefix.
|
| 119 |
-
merged = {}
|
| 120 |
-
for topic_obj in subtopic_list:
|
| 121 |
-
raw_title = topic_obj.get("title", "")
|
| 122 |
-
children = topic_obj.get("children", [])
|
| 123 |
-
contents = topic_obj.get("contents", [])
|
| 124 |
-
new_title = unify_topic_name(raw_title, children)
|
| 125 |
-
# Extract parent's numeric prefix, if present.
|
| 126 |
-
m = re.match(r"^(\d+)", new_title)
|
| 127 |
-
parent_prefix = m.group(1) if m else None
|
| 128 |
-
key = parent_prefix if parent_prefix is not None else new_title
|
| 129 |
-
|
| 130 |
-
if key not in merged:
|
| 131 |
-
merged[key] = {
|
| 132 |
-
"title": new_title,
|
| 133 |
-
"contents": list(contents),
|
| 134 |
-
"children": list(children),
|
| 135 |
-
}
|
| 136 |
-
else:
|
| 137 |
-
# Merge contents and children; choose the longer title.
|
| 138 |
-
if len(new_title) > len(merged[key]["title"]):
|
| 139 |
-
merged[key]["title"] = new_title
|
| 140 |
-
merged[key]["contents"].extend(contents)
|
| 141 |
-
merged[key]["children"].extend(children)
|
| 142 |
-
|
| 143 |
-
# Build a lookup of merged topics by their numeric prefix.
|
| 144 |
-
parent_lookup = merged # keys are numeric prefixes or the full title for non-numeric ones.
|
| 145 |
-
|
| 146 |
-
# Reassign children to the correct parent based on their numeric prefix.
|
| 147 |
-
for key, topic in merged.items():
|
| 148 |
-
new_children = []
|
| 149 |
-
for child in topic["children"]:
|
| 150 |
-
child_title = child.get("title", "").strip()
|
| 151 |
-
m_child = re.match(r"^(\d+)\.", child_title)
|
| 152 |
-
if m_child:
|
| 153 |
-
child_prefix = m_child.group(1)
|
| 154 |
-
if key != child_prefix and child_prefix in parent_lookup:
|
| 155 |
-
# Reassign this child to the proper parent.
|
| 156 |
-
parent_lookup[child_prefix]["children"].append(child)
|
| 157 |
-
continue
|
| 158 |
-
new_children.append(child)
|
| 159 |
-
topic["children"] = new_children
|
| 160 |
-
|
| 161 |
-
# Remove duplicate children by merging their contents.
|
| 162 |
-
for topic in merged.values():
|
| 163 |
-
child_map = {}
|
| 164 |
-
for child in topic["children"]:
|
| 165 |
-
ctitle = child.get("title", "").strip()
|
| 166 |
-
if ctitle not in child_map:
|
| 167 |
-
child_map[ctitle] = child
|
| 168 |
-
else:
|
| 169 |
-
child_map[ctitle]["contents"].extend(child.get("contents", []))
|
| 170 |
-
child_map[ctitle]["children"].extend(child.get("children", []))
|
| 171 |
-
topic["children"] = list(child_map.values())
|
| 172 |
-
|
| 173 |
-
# Sort children by full numeric order (e.g. "2.1" < "2.10" < "2.2").
|
| 174 |
-
def parse_subtopic_num(subtitle):
|
| 175 |
-
digits = re.findall(r"\d+", subtitle)
|
| 176 |
-
return tuple(int(d) for d in digits) if digits else (9999,)
|
| 177 |
-
topic["children"].sort(key=lambda ch: parse_subtopic_num(ch.get("title", "")))
|
| 178 |
-
|
| 179 |
-
# Convert merged topics to a sorted list.
|
| 180 |
-
def parse_parent_num(topic):
|
| 181 |
-
m = re.match(r"^(\d+)", topic.get("title", ""))
|
| 182 |
-
return int(m.group(1)) if m else 9999
|
| 183 |
-
final_list = list(merged.values())
|
| 184 |
-
final_list.sort(key=lambda topic: parse_parent_num(topic))
|
| 185 |
-
return final_list
|
| 186 |
-
|
| 187 |
-
class s3Writer:
|
| 188 |
-
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
| 189 |
-
self.bucket = bucket
|
| 190 |
-
self.client = boto3.client(
|
| 191 |
-
's3',
|
| 192 |
-
aws_access_key_id=ak,
|
| 193 |
-
aws_secret_access_key=sk,
|
| 194 |
-
endpoint_url=endpoint_url
|
| 195 |
-
)
|
| 196 |
-
|
| 197 |
-
def write(self, path: str, data: bytes) -> None:
|
| 198 |
-
try:
|
| 199 |
-
file_obj = BytesIO(data)
|
| 200 |
-
self.client.upload_fileobj(
|
| 201 |
-
file_obj,
|
| 202 |
-
self.bucket,
|
| 203 |
-
path
|
| 204 |
-
)
|
| 205 |
-
logger.info(f"Uploaded to S3: {path}")
|
| 206 |
-
except Exception as e:
|
| 207 |
-
logger.error(f"Failed to upload to S3: {str(e)}")
|
| 208 |
-
raise
|
| 209 |
-
|
| 210 |
-
def delete(self, path: str) -> None:
|
| 211 |
-
try:
|
| 212 |
-
self.client.delete_object(Bucket=self.bucket, Key=path)
|
| 213 |
-
except Exception as e:
|
| 214 |
-
logger.error(f"Failed to delete from S3: {str(e)}")
|
| 215 |
-
raise
|
| 216 |
-
|
| 217 |
-
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
|
| 218 |
-
arr = np.frombuffer(image_data, np.uint8)
|
| 219 |
-
img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
|
| 220 |
-
if img is not None:
|
| 221 |
-
h, w, _ = img.shape
|
| 222 |
-
if max(h, w) > max_dim:
|
| 223 |
-
scale = max_dim / float(max(h, w))
|
| 224 |
-
new_w = int(w * scale)
|
| 225 |
-
new_h = int(h * scale)
|
| 226 |
-
img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
| 227 |
-
encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
|
| 228 |
-
success, enc = cv2.imencode(".jpg", img, encode_params)
|
| 229 |
-
if success:
|
| 230 |
-
return enc.tobytes()
|
| 231 |
-
return image_data
|
| 232 |
-
|
| 233 |
-
def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
|
| 234 |
-
"""
|
| 235 |
-
Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
|
| 236 |
-
"""
|
| 237 |
-
for attempt in range(max_retries + 1):
|
| 238 |
-
try:
|
| 239 |
-
prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
|
| 240 |
-
The three-column 'table' image includes such key features:
|
| 241 |
-
- Three columns header
|
| 242 |
-
- Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
|
| 243 |
-
- Possibly sections (e.g. 8.4, 9.1)
|
| 244 |
-
The two-column 'table' image includes such key features:
|
| 245 |
-
- Two columns
|
| 246 |
-
- Headers like 'Subject content', 'Additional information'
|
| 247 |
-
- Possibly sections (e.g. 2.1, 3.4, G2, G3, )
|
| 248 |
-
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
|
| 249 |
-
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
|
| 250 |
-
If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
|
| 251 |
-
Return only one of these exact labels.
|
| 252 |
-
"""
|
| 253 |
-
global _GEMINI_CLIENT
|
| 254 |
-
if _GEMINI_CLIENT is None:
|
| 255 |
-
_GEMINI_CLIENT = genai.Client(api_key=api_key)
|
| 256 |
-
client = _GEMINI_CLIENT
|
| 257 |
-
|
| 258 |
-
resp = client.models.generate_content(
|
| 259 |
-
model="gemini-2.0-flash",
|
| 260 |
-
contents=[
|
| 261 |
-
{
|
| 262 |
-
"parts": [
|
| 263 |
-
{"text": prompt},
|
| 264 |
-
{
|
| 265 |
-
"inline_data": {
|
| 266 |
-
"mime_type": "image/jpeg",
|
| 267 |
-
"data": base64.b64encode(image_data).decode('utf-8')
|
| 268 |
-
}
|
| 269 |
-
}
|
| 270 |
-
]
|
| 271 |
-
}
|
| 272 |
-
],
|
| 273 |
-
config=types.GenerateContentConfig(temperature=0.0)
|
| 274 |
-
)
|
| 275 |
-
if resp and resp.text:
|
| 276 |
-
classification = resp.text.strip().upper()
|
| 277 |
-
if "THREE" in classification:
|
| 278 |
-
return "THREE_COLUMN"
|
| 279 |
-
elif "TWO" in classification:
|
| 280 |
-
return "TWO_COLUMN"
|
| 281 |
-
elif "EMPTY" in classification:
|
| 282 |
-
return "EMPTY_IMAGE"
|
| 283 |
-
return "NO_TABLE"
|
| 284 |
-
except Exception as e:
|
| 285 |
-
logger.error(f"Gemini table classification error: {e}")
|
| 286 |
-
if "503" in str(e):
|
| 287 |
-
return "NO_TABLE"
|
| 288 |
-
if attempt < max_retries:
|
| 289 |
-
time.sleep(0.5)
|
| 290 |
-
else:
|
| 291 |
-
return "NO_TABLE"
|
| 292 |
-
|
| 293 |
-
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
|
| 294 |
-
loop = asyncio.get_event_loop()
|
| 295 |
-
preprocessed = preprocess_image(image_data)
|
| 296 |
-
return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
|
| 297 |
-
|
| 298 |
-
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """Extract a topic title and subtopic numbers from one table-cell image.

    Sends the cell image plus a long rule-based prompt to gemini-2.0-flash
    and parses the model's JSON reply.

    Args:
        image_data: Raw JPEG bytes of a single table cell.
        api_key: Gemini API key, used only to lazily create the shared client.
        max_retries: Extra attempts after a failure (0.5 s pause between).

    Returns:
        dict with keys "title" (str; "EMPTY_IMAGE" marks a blank/truncated
        cell) and "subtopics" (list of str). Returns
        {"title": "", "subtopics": []} after persistent errors.
    """
    for attempt in range(max_retries + 1):
        try:
            prompt = """
            You are given an image from an educational curriculum specification for Gemini Flash 2. The image may contain:
            1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
            2) A subtopic heading in the format "<number>.<number>" or "<number>.<number>.<number>", for example "2.5", "2.6", "3.4", "2.1.1", "4.3.3" or "1.2.1".
            3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
            4) Possibly no relevant text or only truncated text (e.g. "Topics", "Subject content", "What students need to learn", "Content Amplification Additional guidance notes", etc.).

            Your task is to extract:
            - **"title"**: A recognized main topic or heading text.
            - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4", "G2", "2.1.1", "4.1.1"), as an array of strings.

            Follow these rules:

            (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued":
            - Remove the word "continued" if present.
            - Put that resulting text in "title". (e.g. "2 Algebra and functions")
            - "subtopics" should be an empty array, unless smaller subtopic numbers (e.g. "2.5") are also detected in the same text.

            (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4":
            - Collect those exact strings in the JSON key "subtopics" (an array of strings).
            - "title" in this case should be an empty string if you only detect subtopics.
            (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).

            (3) **If no main topic or subtopic is detected but the text appears to be a heading**, for example "Specialisation, division of labour and exchange", then:
            - Return:
            {
              "title": "<the heading text>",
              "subtopics": []
            }

            (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
            - Use that left column text as "title".
            - "subtopics" remains empty.
            Example:
            If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
            {
              "title": "Scarcity, choice and opportunity cost",
              "subtopics": []
            }

            (5) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) or it appears to be a standalone column with text, treat it as a heading.
            - "subtopics" remains empty.
            Example:
            If there is only one column image that is "Specialisation, devision of labour and exchange" and the right column is not present, your output is:
            {
              "title": "Specialisation, devision of labour and exchange",
              "subtopics": []
            }

            (6) **If there is a character + digit pattern** in the left column of a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
            - Put that label text into "title" (e.g. "G2").
            - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.

            (7) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
            {
              "title": "...",
              "subtopics": [...]
            }

            (8) **If the image is blank or truncated**, defined as:
            - Contains no words at all (e.g. a blank white or black image), **OR**
            - Contains only snippet words/phrases such as "Topics", "Subject content", "Content Amplification Additional guidance notes", "What students need to learn" (including variations in background color), **OR**
            - Contains partial headings with no recognizable numeric or textual headings
            - Contains partial UI labels only, such as “Topics” in a gray bar or “What students need to learn” in a blue bar, with no additional meaningful text.
            then return:
            {
              "title": "EMPTY_IMAGE",
              "subtopics": []
            }

            (9) **If you cannot recognize any text matching the patterns above**, or the text is too partial/truncated to form a valid heading, also return:
            {
              "title": "EMPTY_IMAGE",
              "subtopics": []
            }

            **Examples**:

            - If the image text is "2 Algebra and functions continued", return:
            {
              "title": "2 Algebra and functions",
              "subtopics": []
            }

            - If the image text is "2.5 Solve linear and quadratic inequalities ...", return:
            {
              "title": "",
              "subtopics": ["2.5"]
            }

            - If the image text is "Specialisation, division of labour and exchange" (with no numeric patterns at all), return:
            {
              "title": "Specialisation, division of labour and exchange",
              "subtopics": []
            }

            - If the left column says "G2" and the right column has details, but no subtopic numbers, return:
            {
              "title": "G2",
              "subtopics": []
            }

            - If the image is blank or shows only partial/truncated snippet words (e.g. "Topics", "Content Amplification Additional guidance notes", "Subject content", "What students need to learn") and nothing else, return:
            {
              "title": "EMPTY_IMAGE",
              "subtopics": []
            }
            """
            # Lazily create and reuse one module-level Gemini client.
            global _GEMINI_CLIENT
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            # Multimodal request: prompt text plus the inline base64 image.
            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode("utf-8")
                                }
                            }
                        ]
                    }
                ],
                # temperature 0 => deterministic labelling.
                config=types.GenerateContentConfig(temperature=0.0)
            )

            if not resp or not resp.text:
                logger.warning("Gemini returned an empty response for subtopic extraction.")
                return {"title": "", "subtopics": []}

            raw = resp.text.strip()
            # Remove any markdown fences if present
            raw = raw.replace("```json", "").replace("```", "").strip()
            # json.loads may raise; the broad except below turns that into a retry.
            data = json.loads(raw)

            title = data.get("title", "")
            subtopics = data.get("subtopics", [])
            # NOTE(review): assumes the model returns "title" as a string;
            # a non-str value would raise here and be retried by the except.
            if title.upper() == "EMPTY_IMAGE":
                return {"title": "EMPTY_IMAGE", "subtopics": []}
            if not isinstance(subtopics, list):
                subtopics = []
            return {"title": title, "subtopics": subtopics}

        except Exception as e:
            # Covers network errors, malformed JSON and schema surprises alike.
            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return {"title": "", "subtopics": []}

    # Defensive fallback; the loop above always returns before reaching here.
    return {"title": "", "subtopics": []}
|
| 458 |
-
class S3ImageWriter(DataWriter):
    """DataWriter that uploads pipeline-extracted images to S3 and rewrites
    the generated markdown.

    Flow:
      1. ``write`` is invoked by the OCR pipeline for every extracted image;
         the bytes are uploaded under ``base_path`` and remembered in
         ``self.descriptions``.
      2. ``post_process`` classifies each image (two-/three-column table,
         empty, or plain image), splits table images into per-cell crops,
         asks Gemini for topic/subtopic labels per cell, and accumulates the
         result in ``self.extracted_subtopics``.

    NOTE(review): several f-string literals in this class were truncated in
    the scraped source this file was recovered from (the markdown image
    tokens were lost, one as a syntax error). They have been reconstructed
    below in the standard ``![alt](path)`` form — confirm against the
    pipeline's actual markdown output.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        self.s3_writer = s3_writer
        # Normalise so key concatenation never yields a double slash.
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        self.descriptions = {}         # local path -> {data, s3_path, table_classification, final_alt}
        self._img_count = 0            # monotonic counter for unique S3 keys
        self.extracted_tables = {}     # kept for interface compatibility; not populated here
        self.extracted_subtopics = {}  # table s3_key -> {"title", "contents", "children"}

    def write(self, path: str, data: bytes) -> None:
        """Upload one extracted image to S3 and record it for post-processing."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",  # overwritten during post-processing
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all collected images, expand tables, and prune the markdown.

        Returns markdown containing only image lines (everything else is
        dropped at the end).
        """
        logger.info("Classifying images to detect tables.")
        # Fan out the blocking classifier calls concurrently.
        tasks = {
            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
            for p, info in self.descriptions.items()
        }
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        for p, result in zip(list(self.descriptions.keys()), results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Rewrite each image reference according to its classification.
        # Iterate over a snapshot because EMPTY_IMAGE entries are deleted.
        for p, info in list(self.descriptions.items()):
            cls = info['table_classification']
            # NOTE(review): reconstructed literal — the pipeline emits images
            # as "![](<local path>)"; the original search string was lost in
            # the scraped source.
            placeholder = f"![]({p})"
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            elif cls == "EMPTY_IMAGE":
                # Blank image: drop the markdown reference and the S3 object.
                md_content = md_content.replace(placeholder, "")
                try:
                    self.s3_writer.delete(info['s3_path'])
                except Exception as e:
                    logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
                del self.descriptions[p]
                continue
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(placeholder, f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep only markdown image lines in the final output.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Split marked table images into cells and extract subtopics per cell."""
        import shutil

        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            # Recover the original bytes for this S3 key from the write() log.
            img_data = None
            for desc in self.descriptions.values():
                if desc.get("s3_path") == s3_key:
                    img_data = desc.get("data")
                    break
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # The extractor works on files, so stage the bytes on disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name

            try:
                # Two-column tables get row merging / subtopic merging heuristics.
                if col_type.lower() == 'two':
                    extractor = TableExtractor(
                        skip_header=True,
                        merge_two_col_rows=True,
                        enable_subtopic_merge=True,
                        subtopic_threshold=0.2
                    )
                else:
                    extractor = TableExtractor(
                        skip_header=True,
                        merge_two_col_rows=False,
                        enable_subtopic_merge=False,
                        subtopic_threshold=0.2
                    )
                row_boxes = extractor.process_image(temp_path)

                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                recognized_main_topic = ""
                main_topic_image_key = None
                recognized_subtopics = []

                # Walk every saved cell image (row_i/col_j layout on disk).
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.png")
                        if not os.path.isfile(cell_path):
                            alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
                            if os.path.isfile(alternative_path):
                                cell_path = alternative_path
                            else:
                                logger.warning(f"Cell image not found: {cell_path}")
                                continue

                        with open(cell_path, "rb") as cf:
                            cell_image_data = cf.read()

                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
                        self.s3_writer.write(cell_key, cell_image_data)

                        # Extract subtopic info from the cell image.
                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)

                        # Blank cells: remove the just-uploaded object and move on.
                        if info.get("title", "").upper() == "EMPTY_IMAGE":
                            try:
                                self.s3_writer.delete(cell_key)
                                logger.info(f"Deleted empty cell image from S3: {cell_key}")
                            except Exception as e:
                                logger.error(f"Error deleting empty cell image {cell_key}: {e}")
                            continue

                        # First recognised title becomes the table's main topic.
                        if info["title"] and not recognized_main_topic:
                            recognized_main_topic = info["title"]
                            main_topic_image_key = cell_key

                        for st in info["subtopics"]:
                            recognized_subtopics.append({
                                "title": st,
                                "contents": [{"type": "image", "key": cell_key}],
                                "children": []
                            })

                final_json = {
                    "title": recognized_main_topic,
                    "contents": [],
                    "children": recognized_subtopics
                }
                if main_topic_image_key:
                    final_json["contents"].append({"type": "image", "key": main_topic_image_key})

                # Save the final JSON, keyed by the source table image.
                self.extracted_subtopics[s3_key] = final_json

                # Replace the table placeholder line with links to the cells.
                # Cells are uploaded as .png above, so link .png (the scraped
                # original linked .jpg, which would not match any uploaded key).
                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    for j, _ in enumerate(row):
                        snippet.append(
                            f"![cell]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png)"
                        )
                new_snip = "\n".join(snippet)
                # NOTE(review): reconstructed literal — must match the line
                # produced in post_process_async for this classification.
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, new_snip)

            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                os.remove(temp_path)
                # Remove the per-table cell folder too (previously leaked).
                shutil.rmtree(temp_path + "_rows", ignore_errors=True)

        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper around :meth:`post_process_async`."""
        return asyncio.run(self.post_process_async(key, md_content))
|
| 653 |
-
class GeminiTopicExtractor:
    """Extracts topic -> [start_page, end_page] ranges from a PDF's table of
    contents by prompting Gemini with the text of the first pages."""

    def __init__(self, api_key: str = None, num_pages: int = 14):
        # Fall back to the environment when no key is passed explicitly.
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        # How many leading pages to read when looking for the contents table.
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """Return {subtopic name: [start_page, end_page]} parsed from the ToC.

        ``pdf_path`` may be a local path or an http(s) URL. Returns an empty
        dict when no text is found, the LLM reply is empty/unparseable, or
        the request fails.
        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}
        # Few-shot prompt; literal JSON braces are escaped as {{ }} because
        # this is an f-string.
        prompt = f"""
        You have the first pages of a PDF specification, including a table of contents.
        Instructions:
        1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
        2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
        3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
        4. Output only valid JSON of the form:
        {{
          "Subtopic A": [start_page, end_page],
          "Subtopic B": [start_page, end_page]
        }}
        5. If you can't find any subtopics, return an empty JSON.
        Important notes:
        - The correct "end_page" must be the page number of the next topic or subtopic minus 1.
        - The final output must be valid JSON only, with no extra text or code blocks.
        Examples:
        1. Given this table of contents:
        1 Introduction – 2
        Why choose Edexcel A Level Mathematics? - 2
        Supporting you in planning and implementing this qualification - 3
        Qualification at a glance - 5
        2 Subject content and assessment information – 7
        Paper 1 and Paper 2: Pure Mathematics - 11
        Paper 3: Statistics and Mechanics - 30
        Assessment Objectives - 40
        3 Administration and general information – 42
        Entries - 42
        Access arrangements, reasonable adjustments, special consideration and malpractice - 42
        Student recruitment and progression - 45
        Appendix 1: Formulae – 49
        Appendix 2: Notation – 53
        Appendix 3: Use of calculators – 59
        Appendix 4: Assessment Objectives – 60
        Appendix 5: The context for the development of this qualification – 62
        Appendix 6: Transferable skills – 64
        Appendix 7: Level 3 Extended Project qualification – 65
        Appendix 8: Codes – 67
        The correct output should be:
        {{
          "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
          "Paper 3: Statistics and Mechanics": [30, 42]
        }}
        2. Given this table of contents:
        Qualification at a glance – 1
        Assessment Objectives and weightings - 4
        Knowledge, skills and understanding – 5
        Theme 1: Introduction to markets and market failure - 5
        Theme 2: The UK economy – performance and policies - 11
        Theme 3: Business behaviour and the labour market - 21
        Theme 4: A global perspective - 29
        Assessment – 39
        Assessment summary - 39
        Assessment objectives - 41
        Assessment overview - 42
        Breakdown of assessment objectives - 42
        Synoptic assessment - 43
        Discount code and performance tables - 43
        Access arrangements, reasonable adjustments and special consideration - 44
        Malpractice - 45
        Equality Act 2010 and Pearson equality policy - 45
        Synoptic assessment - 46
        Awarding and reporting - 47
        Other information – 49
        Student recruitment -49
        Prior learning and other requirements -49
        Progression - 49
        Appendix 1: Transferable skills – 53
        Appendix 2: Level 3 Extended Project qualification – 55
        Appendix 3: Quantitative skills – 59
        Appendix 4: Codes – 61
        Appendix 5: Index – 63
        The correct output should be:
        {{
          "Theme 1: Introduction to markets and market failure": [5, 10],
          "Theme 2: The UK economy – performance and policies": [11, 20],
          "Theme 3: Business behaviour and the labour market": [21, 28],
          "Theme 4: A global perspective": [29, 38]
        }}
        3. You might also see sections like:
        2.1 AS Unit 1 11
        2.2 AS Unit 2 18
        2.3 A2 Unit 3 24
        2.4 A2 Unit 4 31
        In that scenario, your output might look like:
        {{
          "2.1 AS Unit 1": [11, 17],
          "2.2 AS Unit 2": [18, 23],
          "2.3 A2 Unit 3": [24, 30],
          "2.4 A2 Unit 4": [31, 35]
        }}
        or
        2.1 AS units 6
        2.2 AS units 23
        In that scenario, your output might look like:
        {{
          "2.1 AS Unit 1": [6, 2],
          "2.2 AS Unit 2": [23, 43]
        }}

        4. Another example might list subtopics:
        3.1 Overarching themes 11
        3.2 A: Proof 12
        3.3 B: Algebra and functions 13
        3.4 C: Coordinate geometry in the ( x , y ) plane 14
        3.5 D: Sequences and series 15
        3.6 E: Trigonometry 16
        3.7 F: Exponentials and logarithms 17
        3.8 G: Differentiation 18
        3.9 H: Integration 19
        3.10 I: Numerical methods 20
        3.11 J: Vectors 20
        3.12 K: Statistical sampling 21
        3.13 L: Data presentation and interpretation 21
        3.14 M: Probability 22
        3.15 N: Statistical distributions 23
        3.16 O: Statistical hypothesis testing 23
        3.17 P: Quantities and units in mechanics 24
        3.18 Q: Kinematics 24
        3.19 R: Forces and Newton’s laws 24
        3.20 S: Moments 25
        3.21 Use of data in statistics 26
        Here the correct output might look like:
        {{
          "A: Proof": [12, 12],
          "B: Algebra and functions": [13, 13],
          ...
        }}
        Now, extract topics from this text:
        {first_pages_text}
        """
        # Lazily create and reuse one module-level Gemini client.
        global _GEMINI_CLIENT
        if _GEMINI_CLIENT is None:
            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
        client = _GEMINI_CLIENT
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                # temperature 0 => deterministic extraction.
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}
            raw_json = response.text.strip()
            # Strip markdown code fences the model sometimes adds.
            cleaned = raw_json.replace("```json", "").replace("```", "")
            try:
                data = json.loads(cleaned)
            except Exception as json_err:
                logger.error(f"JSON parsing error: {json_err}")
                return {}
            # The model occasionally nests the mapping one level deep
            # (e.g. {"subtopics": {...}}); prefer the first nested dict.
            final_dict = {}
            found_sub_dict = None
            for k, v in data.items():
                if isinstance(v, dict):
                    found_sub_dict = v
                    break
            if found_sub_dict is not None:
                for subk, rng in found_sub_dict.items():
                    # Keep only well-formed [start, end] pairs.
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng
            else:
                for subk, rng in data.items():
                    if isinstance(rng, list) and len(rng) == 2:
                        final_dict[subk] = rng
            return final_dict
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        """Return the raw text of the first ``num_pages`` pages of the PDF.

        Accepts a local path or an http(s) URL; returns "" on any failure
        (download error, unreadable PDF).
        """
        text_parts = []
        try:
            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                # NOTE(review): no timeout on this request — a stalled server
                # would hang here; consider requests.get(pdf_path, timeout=...).
                response = requests.get(pdf_path)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    return ""
                pdf_bytes = response.content
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            pages_to_read = min(num_pages, doc.page_count)
            for i in range(pages_to_read):
                raw_text = doc[i].get_text("raw")
                text_parts.append(raw_text)
            doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
|
| 854 |
-
class MineruNoTextProcessor:
    """End-to-end pipeline: locate the relevant page ranges of a curriculum
    PDF via Gemini, OCR-analyze just those pages, and extract a topic /
    subtopic hierarchy from the table images found there."""

    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        # Layout/OCR settings for doc_analyze.
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False  # tables are handled by our own extractor
        self.language = "en"

        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

        self.use_s3 = True
        # Destination bucket for extracted images / cells.
        self.s3_writer = s3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket="quextro-resources",
            endpoint_url=os.getenv("S3_ENDPOINT")
        )

    def cleanup_gpu(self):
        """Best-effort release of GPU memory after a processing run."""
        try:
            gc.collect()
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    def process(self, pdf_path: str) -> Dict[str, Any]:
        """Process one PDF (local path or http(s) URL).

        Returns:
            {"final_markdown": str, "subtopics_extracted": list} — the pruned
            markdown and the merged topic hierarchy (also written to
            ``<output_folder>/_subtopics.json``).

        Raises:
            Exception: when a URL download fails; file errors propagate.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # 1) Ask Gemini for topic -> page-range mapping from the ToC.
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            # 2) Load the PDF bytes.
            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                # Timeout so a stalled server cannot hang the worker forever.
                response = requests.get(pdf_path, timeout=120)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    raise Exception(f"Failed to download PDF: {pdf_path}")
                pdf_bytes = response.content
                logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()
                logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            total_pages = doc.page_count
            doc.close()

            # 3) Decide which 0-based pages to process.
            final_pages = set()
            if not subtopics:
                # No ToC information: fall back to the whole document.
                final_pages = set(range(total_pages))
            else:
                # The ToC page numbers are offset from physical pages (cover,
                # front matter). Estimate one global offset by locating each
                # subtopic title in the document.
                offset_candidates = []
                for subname, rng in subtopics.items():
                    start_p, _ = rng
                    occs = find_all_occurrences(pdf_bytes, subname)
                    for p in occs:
                        candidate = p - (start_p - 1)
                        if candidate > 0:
                            offset_candidates.append(candidate)
                if offset_candidates:
                    from statistics import StatisticsError, mode, median
                    try:
                        global_offset = mode(offset_candidates)
                    except StatisticsError:
                        # mode() can raise on empty/ambiguous data on older
                        # Pythons; the median is a robust fallback. (Was a
                        # bare `except:` before — now narrowed.)
                        global_offset = int(median(offset_candidates))
                else:
                    global_offset = 0

                logger.info(f"Computed global offset: {global_offset}")
                for subname, rng in subtopics.items():
                    if not (isinstance(rng, list) and len(rng) == 2):
                        continue
                    start_p, end_p = rng
                    if start_p > end_p:
                        continue
                    s0 = (start_p - 1) + global_offset
                    e0 = (end_p - 1) + global_offset
                    for pp in range(s0, e0 + 1):
                        # Clamp to the document: a bad offset must not push
                        # out-of-range indices into create_subset_pdf.
                        if 0 <= pp < total_pages:
                            final_pages.add(pp)

            if not final_pages:
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # 4) Analyze and produce markdown.
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                ocr=True,
                lang=self.language,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable
            )
            # Images/cells go to S3 under this prefix.
            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)

            md_prefix = "/topic-extraction/"
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown(md_prefix)
            final_markdown = writer.post_process(md_prefix, md_content)

            # 5) Merge per-table hierarchies and persist locally.
            subtopic_list = list(writer.extracted_subtopics.values())
            subtopic_list = merge_topics(subtopic_list)

            out_path = os.path.join(self.output_folder, "_subtopics.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(subtopic_list, f, indent=2)
            logger.info(f"Final subtopics JSON saved locally at {out_path}")

            return {
                "final_markdown": final_markdown,
                "subtopics_extracted": subtopic_list
            }
        finally:
            # Always release GPU memory, even on failure.
            self.cleanup_gpu()
|
| 979 |
-
if __name__ == "__main__":
|
| 980 |
-
input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
|
| 981 |
-
output_dir = "/home/user/app/pearson_json"
|
| 982 |
-
api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 983 |
-
try:
|
| 984 |
-
processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=api_key)
|
| 985 |
-
result = processor.process(input_pdf)
|
| 986 |
-
logger.info("Processing completed successfully.")
|
| 987 |
-
except Exception as e:
|
| 988 |
-
logger.error(f"Processing failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
topic_extraction.log → topic_processor.log
RENAMED
|
File without changes
|
worker.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import Tuple, Dict, Any
|
|
| 10 |
|
| 11 |
from mineru_single import Processor
|
| 12 |
|
| 13 |
-
from
|
| 14 |
|
| 15 |
import logging
|
| 16 |
|
|
@@ -27,10 +27,7 @@ class RabbitMQWorker:
|
|
| 27 |
logger.info("Initializing RabbitMQWorker")
|
| 28 |
self.processor = Processor()
|
| 29 |
|
| 30 |
-
self.topic_processor =
|
| 31 |
-
output_folder="/tmp/topic_extraction_outputs",
|
| 32 |
-
gemini_api_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
|
| 33 |
-
)
|
| 34 |
|
| 35 |
self.publisher_connection = None
|
| 36 |
self.publisher_channel = None
|
|
@@ -132,35 +129,32 @@ class RabbitMQWorker:
|
|
| 132 |
elif pattern == "topic_extraction":
|
| 133 |
data = body_dict.get("data")
|
| 134 |
input_files = data.get("input_files")
|
| 135 |
-
logger.info("[Worker %s] Found %d file(s)
|
| 136 |
|
| 137 |
-
topics_contexts = []
|
| 138 |
for file in input_files:
|
| 139 |
try:
|
| 140 |
-
pdf_url = file.get("url")
|
| 141 |
-
logger.info("[Worker %s] Processing topic extraction for URL: %s", thread_id, pdf_url)
|
| 142 |
-
|
| 143 |
-
result = self.topic_processor.process(pdf_url)
|
| 144 |
-
# result = self.topic_processor.process(pdf_url, inputs={"api_key": os.getenv("GEMINI_API_KEY")})
|
| 145 |
context = {
|
| 146 |
-
"key": file
|
| 147 |
-
"body":
|
| 148 |
}
|
| 149 |
-
|
| 150 |
except Exception as e:
|
| 151 |
-
err_str = f"Error processing
|
| 152 |
logger.error(err_str)
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
|
| 156 |
body_dict["data"] = data
|
|
|
|
| 157 |
if self.publish_message(body_dict, headers):
|
| 158 |
-
logger.info("[Worker %s]
|
| 159 |
ch.basic_ack(delivery_tag=method.delivery_tag)
|
| 160 |
else:
|
| 161 |
ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
|
| 162 |
-
|
| 163 |
-
|
|
|
|
| 164 |
|
| 165 |
else:
|
| 166 |
ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
|
|
@@ -219,6 +213,4 @@ def main():
|
|
| 219 |
worker.start()
|
| 220 |
|
| 221 |
if __name__ == "__main__":
|
| 222 |
-
main()
|
| 223 |
-
|
| 224 |
-
__all__ = ['main']
|
|
|
|
| 10 |
|
| 11 |
from mineru_single import Processor
|
| 12 |
|
| 13 |
+
from topic_extr import TopicExtractionProcessor
|
| 14 |
|
| 15 |
import logging
|
| 16 |
|
|
|
|
| 27 |
logger.info("Initializing RabbitMQWorker")
|
| 28 |
self.processor = Processor()
|
| 29 |
|
| 30 |
+
self.topic_processor = TopicExtractionProcessor()
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
self.publisher_connection = None
|
| 33 |
self.publisher_channel = None
|
|
|
|
| 129 |
elif pattern == "topic_extraction":
|
| 130 |
data = body_dict.get("data")
|
| 131 |
input_files = data.get("input_files")
|
| 132 |
+
logger.info("[Worker %s] Found %d file(s) for topic extraction.", thread_id, len(input_files))
|
| 133 |
|
|
|
|
| 134 |
for file in input_files:
|
| 135 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
context = {
|
| 137 |
+
"key": file["key"],
|
| 138 |
+
"body": self.topic_processor.process(file)
|
| 139 |
}
|
| 140 |
+
contexts.append(context)
|
| 141 |
except Exception as e:
|
| 142 |
+
err_str = f"Error processing file {file.get('key', '')}: {e}"
|
| 143 |
logger.error(err_str)
|
| 144 |
+
contexts.append({"key": file.get("key", ""), "body": err_str})
|
| 145 |
+
|
| 146 |
+
data["md_context"] = contexts
|
| 147 |
body_dict["pattern"] = "topic_extraction_update_from_gpu_server"
|
| 148 |
body_dict["data"] = data
|
| 149 |
+
|
| 150 |
if self.publish_message(body_dict, headers):
|
| 151 |
+
logger.info("[Worker %s] Published topic extraction results to ml_server.", thread_id)
|
| 152 |
ch.basic_ack(delivery_tag=method.delivery_tag)
|
| 153 |
else:
|
| 154 |
ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True)
|
| 155 |
+
logger.error("[Worker %s] Failed to publish topic results.", thread_id)
|
| 156 |
+
|
| 157 |
+
logger.info("[Worker %s] Topic contexts: %s", thread_id, contexts)
|
| 158 |
|
| 159 |
else:
|
| 160 |
ch.basic_ack(delivery_tag=method.delivery_tag, requeue=False)
|
|
|
|
| 213 |
worker.start()
|
| 214 |
|
| 215 |
if __name__ == "__main__":
|
| 216 |
+
main()
|
|
|
|
|
|