MinerU

Paused

App Files Community

SkyNait committed on Mar 4, 2025

Commit

98af312

1 Parent(s): 2c394b4

correct JSON and filtering

Browse files

Files changed (6) hide show

topic_extr.py +0 -972
topic_extract_arsenii.py +0 -860
topic_extraction.py +463 -223
topic_extraction_ars.log +0 -746
we/final_subtopics.json +0 -1139
we/we_ars/final_subtopics.json +0 -282

topic_extr.py DELETED Viewed

@@ -1,972 +0,0 @@
-#!/usr/bin/env python3
-import os
-import re
-import gc
-import json
-import logging
-import fitz
-import boto3
-import base64
-import time
-import asyncio
-import tempfile
-import requests
-from io import BytesIO
-from typing import List, Dict, Any
-import torch
-import cv2
-import numpy as np
-from google import genai
-from google.genai import types
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.data.data_reader_writer.base import DataWriter
-from table_row_extraction import TableExtractor
# Gemini client shared by the helpers below; created on first use.
_GEMINI_CLIENT = None

# Root logging at INFO, plus a module logger that additionally writes to a log file.
logging.basicConfig(level=logging.INFO)
file_handler = logging.FileHandler("topic_extraction.log")
file_handler.setFormatter(
    logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s")
)
logger = logging.getLogger(__name__)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)

# Helper functions (module level).
def unify_whitespace(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """Return the 0-based indices of every page whose text contains *search_text*.

    Both the page text and the search text are passed through
    ``unify_whitespace`` before comparison, so differences in line breaks and
    spacing do not prevent a match.

    Args:
        pdf_bytes: The raw bytes of the PDF to scan.
        search_text: The text to look for. An empty (or whitespace-only) value
            matches every page, because the empty string is a substring of
            any text.

    Returns:
        Matching page indices in ascending order.
    """
    needle = unify_whitespace(search_text)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Pages are visited in order, so the result is already sorted.
        return [
            page_index
            for page_index in range(doc.page_count)
            if needle in unify_whitespace(doc[page_index].get_text("raw"))
        ]
    finally:
        # Close the document even when text extraction raises.
        doc.close()
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """Build a new PDF that contains only the requested pages.

    Args:
        original_pdf_bytes: The raw bytes of the source PDF.
        page_indices: 0-based page indices to keep. Duplicates are ignored and
            pages are emitted in ascending index order.

    Returns:
        The bytes of the newly created PDF.

    Raises:
        ValueError: If *page_indices* is empty or contains an index outside
            the source document.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Runs on the error path too, so neither document is leaked.
            new_doc.close()
    finally:
        doc.close()
def _topic_text(value) -> str:
    """Return *value* as text, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
    """Clean up a topic title.

    - A trailing "continued" is removed.
    - A title that already starts with a number is returned as is.
    - Otherwise, if every numbered child ("<n>.<m>...") shares the same leading
      number, that number is prepended (or "<n> Topic" is used when the title
      is empty).
    - The known broken title "gonometry" is repaired to "5 Trigonometry" when
      the first numbered child belongs to topic 5.

    Args:
        raw_title: The title as extracted; ``None`` is treated as empty.
        children_subtopics: Child dicts whose "title" may carry a numeric
            prefix. Non-dict entries are ignored and non-string titles
            (e.g. a JSON ``null`` or the number ``2.5``) are converted to text.

    Returns:
        The cleaned title.
    """
    title = _topic_text(raw_title).strip()
    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
    if re.match(r"^\d+", title):
        return title

    # Collect the leading number of every child titled like "3.1 ...".
    prefixes = []
    for child in children_subtopics or []:
        if not isinstance(child, dict):
            continue
        m = re.match(r"^(\d+)\.", _topic_text(child.get("title")).strip())
        if m:
            prefixes.append(m.group(1))

    # Repair known broken titles BEFORE prefixing: once a number is prepended
    # the title no longer equals "gonometry" and the repair could never fire.
    if title.lower() in {"gonometry"} and prefixes and prefixes[0] == "5":
        return "5 Trigonometry"

    # Only trust the children when they all agree on the parent number.
    if prefixes and all(p == prefixes[0] for p in prefixes):
        title = f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
    return title
def _copy_topic_child(child: dict) -> dict:
    """Return a shallow copy of *child* with a text title and its own list fields."""
    node = dict(child)
    title = node.get("title")
    node["title"] = "" if title is None else str(title)
    node["contents"] = list(node.get("contents") or [])
    node["children"] = list(node.get("children") or [])
    return node


def _subtopic_sort_key(title: str) -> tuple:
    """Return every number in *title* as an int tuple; titles without numbers sort last."""
    digits = re.findall(r"\d+", title)
    return tuple(int(d) for d in digits) if digits else (9999,)


def merge_topics(subtopic_list: list) -> list:
    """Merge extracted topic fragments into one ordered topic list.

    Steps:
    1. Clean each topic title with ``unify_topic_name``.
    2. Group topics by the leading number of the cleaned title; topics without
       a number are grouped by the title itself. When groups collide, the
       longer title wins and contents/children are concatenated.
    3. Move each child titled "<n>.<m>" to the group keyed "<n>" when that
       group exists and is not the child's current parent.
    4. Merge children that share a (stripped) title by concatenating their
       contents and children.
    5. Sort children numerically ("2.1" < "2.2" < "2.10") and parents by their
       leading number; entries without numbers sort last.

    The input is not modified: children are shallow-copied, and a child that
    lacks "contents" or "children" is treated as having empty lists.

    Args:
        subtopic_list: Dicts with optional "title", "contents" and "children".

    Returns:
        The merged topics as a new list of dicts with those three keys.
    """
    merged = {}
    for topic_obj in subtopic_list:
        children = [
            _copy_topic_child(child)
            for child in topic_obj.get("children") or []
            if isinstance(child, dict)
        ]
        contents = list(topic_obj.get("contents") or [])
        new_title = unify_topic_name(topic_obj.get("title") or "", children)

        m = re.match(r"^(\d+)", new_title)
        key = m.group(1) if m else new_title
        entry = merged.get(key)
        if entry is None:
            merged[key] = {"title": new_title, "contents": contents, "children": children}
            continue
        # Same group seen again: keep the more descriptive (longer) title.
        if len(new_title) > len(entry["title"]):
            entry["title"] = new_title
        entry["contents"].extend(contents)
        entry["children"].extend(children)

    # Reassign children whose own number points at a different parent group.
    for key, topic in merged.items():
        kept = []
        for child in topic["children"]:
            m_child = re.match(r"^(\d+)\.", child["title"].strip())
            if m_child:
                owner = m_child.group(1)
                if owner != key and owner in merged:
                    merged[owner]["children"].append(child)
                    continue
            kept.append(child)
        topic["children"] = kept

    # Merge duplicate children, then order them numerically.
    for topic in merged.values():
        unique = {}
        for child in topic["children"]:
            first = unique.setdefault(child["title"].strip(), child)
            if first is not child:
                first["contents"].extend(child["contents"])
                first["children"].extend(child["children"])
        topic["children"] = sorted(
            unique.values(), key=lambda ch: _subtopic_sort_key(ch["title"])
        )

    def parent_number(topic: dict) -> int:
        m_parent = re.match(r"^(\d+)", topic.get("title", ""))
        return int(m_parent.group(1)) if m_parent else 9999

    return sorted(merged.values(), key=parent_number)
class s3Writer:
    """Thin wrapper around a boto3 S3 client bound to a single bucket."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Create the S3 client from the access key, secret key and endpoint URL."""
        client_options = {
            "aws_access_key_id": ak,
            "aws_secret_access_key": sk,
            "endpoint_url": endpoint_url,
        }
        self.bucket = bucket
        self.client = boto3.client('s3', **client_options)

    def write(self, path: str, data: bytes) -> None:
        """Upload *data* to the bucket under key *path*; log and re-raise on failure."""
        try:
            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
            logger.info(f"Uploaded to S3: {path}")
        except Exception as e:
            logger.error(f"Failed to upload to S3: {str(e)}")
            raise

    def delete(self, path: str) -> None:
        """Delete the object stored under key *path*; log and re-raise on failure."""
        try:
            self.client.delete_object(Bucket=self.bucket, Key=path)
        except Exception as e:
            logger.error(f"Failed to delete from S3: {str(e)}")
            raise
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
    """Downscale an encoded image and re-encode it as JPEG.

    Args:
        image_data: Encoded image bytes.
        max_dim: Upper bound for the longer side, in pixels. Larger images are
            scaled down with their aspect ratio preserved; smaller ones keep
            their size.
        quality: JPEG quality passed to the encoder.

    Returns:
        The JPEG bytes, or *image_data* unchanged when the input is empty,
        cannot be decoded, or cannot be re-encoded.
    """
    if not image_data:
        # Nothing to decode; hand the input back instead of calling the decoder.
        return image_data

    arr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        return image_data

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / float(longest)
        # Clamp to 1 px so a very elongated image cannot produce a zero-sized target.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

    success, enc = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
    return enc.tobytes() if success else image_data
def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """
    Ask Gemini whether an image shows a two- or three-column table.

    Returns "THREE_COLUMN", "TWO_COLUMN" or "EMPTY_IMAGE" when the reply
    contains the corresponding keyword, otherwise "NO_TABLE". Errors are
    logged; a "503" error returns "NO_TABLE" immediately, any other error is
    retried up to *max_retries* times (0.5 s apart) before returning "NO_TABLE".
    """
    global _GEMINI_CLIENT

    prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image includes such key features:
    - Three columns header
    - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
    - Possibly sections (e.g. 8.4, 9.1)
The two-column 'table' image includes such key features:
    - Two columns
    - Headers like 'Subject content', 'Additional information'
    - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
Return only one of these exact labels.
"""
    # Checked in this order; the first keyword found in the reply decides the label.
    keyword_labels = (
        ("THREE", "THREE_COLUMN"),
        ("TWO", "TWO_COLUMN"),
        ("EMPTY", "EMPTY_IMAGE"),
    )

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            image_part = {
                "inline_data": {
                    "mime_type": "image/jpeg",
                    "data": base64.b64encode(image_data).decode('utf-8')
                }
            }
            resp = _GEMINI_CLIENT.models.generate_content(
                model="gemini-2.0-flash",
                contents=[{"parts": [{"text": prompt}, image_part]}],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if resp and resp.text:
                answer = resp.text.strip().upper()
                for keyword, label in keyword_labels:
                    if keyword in answer:
                        return label
            return "NO_TABLE"
        except Exception as e:
            logger.error(f"Gemini table classification error: {e}")
            # Give up on a 503 or once the retries are used up.
            if "503" in str(e) or attempt >= max_retries:
                return "NO_TABLE"
            time.sleep(0.5)
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Classify an image as a table type without blocking on the Gemini call.

    The image is preprocessed with ``preprocess_image`` and the blocking
    ``call_gemini_for_table_classification`` call is run in the event loop's
    default executor.

    Args:
        image_data: Encoded image bytes.
        api_key: Gemini API key.
        max_retries: Retry budget forwarded to the classification call.

    Returns:
        The label returned by ``call_gemini_for_table_classification``.
    """
    # A coroutine always has a running loop; get_running_loop() returns exactly
    # that loop, whereas get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    preprocessed = preprocess_image(image_data)
    return await loop.run_in_executor(
        None, call_gemini_for_table_classification, preprocessed, api_key, max_retries
    )
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """Ask Gemini for the topic title and subtopic numbers shown in a table-cell image.

    Args:
        image_data: Encoded image bytes, sent to the model as "image/jpeg".
        api_key: Gemini API key, used to create the shared client on first use.
        max_retries: Number of additional attempts after a failed call or an
            unparsable reply (0.5 s pause between attempts).

    Returns:
        ``{"title": str, "subtopics": list[str]}``. The title is "EMPTY_IMAGE"
        (with no subtopics) when the model reports a blank/truncated image.
        Both fields are empty when the model returns nothing or every attempt
        fails.
    """
    global _GEMINI_CLIENT

    # This is a plain string, not an f-string, so JSON braces are written singly
    # (doubled braces would be shown to the model literally).
    # NOTE(review): rule (3) asks for an empty title for an un-numbered heading,
    # while rule (4) and the examples ask for the heading text - confirm which
    # one is intended.
    prompt = """
You are given an image from an educational curriculum specification. The image may contain:
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
4) Possibly no relevant text at all.
Your task is to extract:
- **"title"**: A recognized main topic or heading text.
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.
Follow these rules:
(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
    - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
    - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.
(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4", then:
    - Collect those exact strings in the JSON key "subtopics" (an array of strings).
    - "title" in this case should be an empty string if you only detect subtopics.
      (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
(3) If no main topic or subtopic is detected but the text appears to be a heading (e.g. "Scarcity, choice and opportunity cost"), return:
    {
      "title": "",
      "subtopics": []
    }
(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
    - Use the **left column text** as "title".
    - "subtopics" remains empty.
    Example:
    If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
    {
      "title": "Scarcity, choice and opportunity cost",
      "subtopics": []
    }
(5) **If there is a character + digit pattern** in the left column for a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
    - Put that label text into "title" (e.g. "G2").
    - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
(6) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
    {
      "title": "...",
      "subtopics": [...]
    }
(7) If the image is blank or truncated, defined as:
    - Contains no words at all (e.g. a blank white or black image)
    - Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
    - Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
    {
        "title": "EMPTY_IMAGE",
        "subtopics": []
    }
**Examples**:
- If the image text is `"2 Algebra and functions continued"`, return:
  {
    "title": "2 Algebra and functions",
    "subtopics": []
  }
- If the image text is `"2.5 Solve linear and quadratic inequalities ..."`, return:
  {
    "title": "",
    "subtopics": ["2.5"]
  }
- If the image text is `"Scarcity, choice and opportunity cost"` (with no numeric patterns at all), return:
  {
    "title": "Scarcity, choice and opportunity cost",
    "subtopics": []
  }
- If the left column says `"G2"` and the right column has details, but no subtopic numbers, return:
  {
    "title": "G2",
    "subtopics": []
  }
- If you cannot recognize any text matching these patterns, or if nothing is found, return:
  {
    "title": "",
    "subtopics": []
  }
"""

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            image_part = {
                "inline_data": {
                    "mime_type": "image/jpeg",
                    "data": base64.b64encode(image_data).decode("utf-8")
                }
            }
            resp = _GEMINI_CLIENT.models.generate_content(
                model="gemini-2.0-flash",
                contents=[{"parts": [{"text": prompt}, image_part]}],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not resp or not resp.text:
                logger.warning("Gemini returned an empty response for subtopic extraction.")
                return {"title": "", "subtopics": []}

            # Remove any markdown fences if present.
            raw = resp.text.strip().replace("```json", "").replace("```", "").strip()
            data = json.loads(raw)
            if not isinstance(data, dict):
                # Handled like any other bad reply: logged and retried below.
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            title = data.get("title", "")
            if not isinstance(title, str):
                title = ""
            if title.upper() == "EMPTY_IMAGE":
                return {"title": "EMPTY_IMAGE", "subtopics": []}

            subtopics = data.get("subtopics", [])
            if not isinstance(subtopics, list):
                subtopics = []
            # The model may emit 2.5 as a number; callers use these as title strings.
            subtopics = [str(s) for s in subtopics if isinstance(s, (str, int, float))]
            return {"title": title, "subtopics": subtopics}
        except Exception as e:
            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
            if attempt >= max_retries:
                return {"title": "", "subtopics": []}
            time.sleep(0.5)
    return {"title": "", "subtopics": []}
class S3ImageWriter(DataWriter):
    """DataWriter that uploads extracted images to S3 and post-processes tables.

    Every image passed to ``write`` is uploaded and remembered in
    ``descriptions``. ``post_process`` then classifies the images with Gemini,
    splits recognised two/three-column tables into cell images, uploads those
    cells, and records the detected topic/subtopic structure per table in
    ``extracted_subtopics`` (keyed by the table image's S3 key).
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        """Store the S3 writer, the key prefix (forced to end in "/") and the API key."""
        self.s3_writer = s3_writer
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        self.descriptions = {}  # original image path -> upload/classification info
        self._img_count = 0  # running counter used to name uploaded images
        self.extracted_tables = {}
        self.extracted_subtopics = {}  # table S3 key -> {"title", "contents", "children"}

    def write(self, path: str, data: bytes) -> None:
        """Upload *data* as "img_<n>.jpg" under ``base_path`` and register it under *path*."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all images, rewrite their markdown references and expand tables.

        Args:
            key: Prefix that precedes each image path in the markdown references.
            md_content: Markdown containing ``![](<key><path>)`` image references.

        Returns:
            The processed markdown reduced to its image lines, joined by newlines.
        """
        logger.info("Classifying images to detect tables.")
        tasks = {
            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
            for p, info in self.descriptions.items()
        }
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        # gather() preserves task order, so results line up with the task keys.
        for p, result in zip(tasks, results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Iterate over a copy: empty images are removed from the dict on the way.
        for p, info in list(self.descriptions.items()):
            cls = info['table_classification']
            if cls == "EMPTY_IMAGE":
                # Remove markdown reference, delete from descriptions and S3.
                md_content = md_content.replace(f"![]({key}{p})", "")
                try:
                    self.s3_writer.delete(info['s3_path'])
                except Exception as e:
                    logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
                del self.descriptions[p]
                continue
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(
                f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})"
            )

        md_content = await self._process_table_images_in_markdown(key, md_content)
        # Filter final lines to keep only lines with images.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Replace each flagged table image with links to its uploaded cell images.

        For every "HAS TO BE PROCESSED" table reference the image is split into
        cells, each cell is uploaded to ``<base_path>cells/`` and sent to Gemini
        for topic/subtopic detection, and the result is stored in
        ``extracted_subtopics``. Cells reported as EMPTY_IMAGE are deleted from
        S3 again. The markdown line is replaced by one image link per cell that
        was uploaded and kept. A failure while processing one table is logged
        and leaves that table's markdown line untouched.
        """
        import shutil  # only needed here, for removing the temporary cell folder

        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for col_type, s3_key in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = next(
                (desc.get("data") for desc in self.descriptions.values()
                 if desc.get("s3_path") == s3_key),
                None,
            )
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # Write temporary file for processing.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name
            out_folder = temp_path + "_rows"
            try:
                # Two-column tables merge rows/subtopics; three-column tables do not.
                is_two_column = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=is_two_column,
                    enable_subtopic_merge=is_two_column,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                recognized_main_topic = ""
                main_topic_image_key = None
                recognized_subtopics = []
                kept_cells = []  # (row, col, S3 key) of every cell that stays in S3

                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.png")
                        if not os.path.isfile(cell_path):
                            alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
                            if not os.path.isfile(alternative_path):
                                logger.warning(f"Cell image not found: {cell_path}")
                                continue
                            cell_path = alternative_path
                        with open(cell_path, "rb") as cf:
                            cell_image_data = cf.read()
                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
                        self.s3_writer.write(cell_key, cell_image_data)

                        # Extract subtopic info from the cell image.
                        info = call_gemini_for_subtopic_identification_image(
                            cell_image_data, self.gemini_api_key
                        )
                        if info.get("title", "").upper() == "EMPTY_IMAGE":
                            try:
                                self.s3_writer.delete(cell_key)
                                logger.info(f"Deleted empty cell image from S3: {cell_key}")
                            except Exception as e:
                                logger.error(f"Error deleting empty cell image {cell_key}: {e}")
                            continue  # Skip processing this cell further

                        kept_cells.append((i, j, cell_key))
                        # The first titled cell names the whole table.
                        if info["title"] and not recognized_main_topic:
                            recognized_main_topic = info["title"]
                            main_topic_image_key = cell_key
                        for st in info["subtopics"]:
                            recognized_subtopics.append({
                                "title": st,
                                "contents": [{"type": "image", "key": cell_key}],
                                "children": []
                            })

                final_json = {
                    "title": recognized_main_topic,
                    "contents": [],
                    "children": recognized_subtopics
                }
                if main_topic_image_key:
                    final_json["contents"].append({"type": "image", "key": main_topic_image_key})
                self.extracted_subtopics[s3_key] = final_json

                # Link exactly the keys that were uploaded and kept, so every
                # reference in the markdown points at an existing S3 object.
                snippet = ["**Extracted table cells:**"]
                snippet.extend(
                    f"![Row {i} Col {j}]({cell_key})" for i, j, cell_key in kept_cells
                )
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, "\n".join(snippet))
            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                # The cell files are only read inside the loop above, so the
                # folder can be removed together with the temporary image.
                shutil.rmtree(out_folder, ignore_errors=True)
                os.remove(temp_path)
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` with ``asyncio.run``."""
        return asyncio.run(self.post_process_async(key, md_content))
-class GeminiTopicExtractor:
-    def __init__(self, api_key: str = None, num_pages: int = 14):
-        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
-        self.num_pages = num_pages
-    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
-        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
-        if not first_pages_text.strip():
-            logger.error("No text from first pages => cannot extract subtopics.")
-            return {}
-        prompt = f"""
-You have the first pages of a PDF specification, including a table of contents.
-Instructions:
-1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
-2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
-3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
-4. Output only valid JSON of the form:
-    {{
-    "Subtopic A": [start_page, end_page],
-    "Subtopic B": [start_page, end_page]
-    }}
-5. If you can't find any subtopics, return an empty JSON.
-Important notes:
-- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
-- The final output must be valid JSON only, with no extra text or code blocks.
-Examples:
-1. Given this table of contents:
-1 Introduction – 2
-    Why choose Edexcel A Level Mathematics? - 2
-    Supporting you in planning and implementing this qualification - 3
-    Qualification at a glance - 5
-2 Subject content and assessment information – 7
-    Paper 1 and Paper 2: Pure Mathematics - 11
-    Paper 3: Statistics and Mechanics - 30
-    Assessment Objectives - 40
-3 Administration and general information – 42
-    Entries - 42
-    Access arrangements, reasonable adjustments, special consideration and malpractice - 42
-    Student recruitment and progression - 45
-Appendix 1: Formulae – 49
-Appendix 2: Notation – 53
-Appendix 3: Use of calculators – 59
-Appendix 4: Assessment Objectives – 60
-Appendix 5: The context for the development of this qualification – 62
-Appendix 6: Transferable skills – 64
-Appendix 7: Level 3 Extended Project qualification – 65
-Appendix 8: Codes – 67
-The correct output should be:
-{{
-    "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-    "Paper 3: Statistics and Mechanics": [30, 42]
-}}
-2. Given this table of contents:
-Qualification at a glance – 1
-    Assessment Objectives and weightings - 4
-Knowledge, skills and understanding – 5
-    Theme 1: Introduction to markets and market failure - 5
-    Theme 2: The UK economy – performance and policies - 11
-    Theme 3: Business behaviour and the labour market - 21
-    Theme 4: A global perspective - 29
-Assessment – 39
-    Assessment summary - 39
-    Assessment objectives - 41
-    Assessment overview - 42
-    Breakdown of assessment objectives - 42
-        Synoptic assessment - 43
-        Discount code and performance tables - 43
-        Access arrangements, reasonable adjustments and special consideration - 44
-        Malpractice - 45
-        Equality Act 2010 and Pearson equality policy - 45
-        Synoptic assessment - 46
-        Awarding and reporting - 47
-Other information – 49
-    Student recruitment -49
-    Prior learning and other requirements -49
-    Progression - 49
-Appendix 1: Transferable skills – 53
-Appendix 2: Level 3 Extended Project qualification – 55
-Appendix 3: Quantitative skills – 59
-Appendix 4: Codes – 61
-Appendix 5: Index – 63
-The correct output should be:
-{{
-    "Theme 1: Introduction to markets and market failure": [5, 10],
-    "Theme 2: The UK economy – performance and policies": [11, 20],
-    "Theme 3: Business behaviour and the labour market": [21, 28],
-    "Theme 4: A global perspective": [29, 38]
-}}
-3. You might also see sections like:
-2.1 AS Unit 1 11
-2.2 AS Unit 2 18
-2.3 A2 Unit 3 24
-2.4 A2 Unit 4 31
-In that scenario, your output might look like:
-{{
-    "2.1 AS Unit 1": [11, 17],
-    "2.2 AS Unit 2": [18, 23],
-    "2.3 A2 Unit 3": [24, 30],
-    "2.4 A2 Unit 4": [31, 35]
-}}
-or
-2.1 AS units 6
-2.2 AS units 23
-In that scenario, your output might look like:
-{{
-    "2.1 AS Unit 1": [6, 2],
-    "2.2 AS Unit 2": [23, 43]
-}}
-4. Another example might list subtopics:
-3.1 Overarching themes 11
-3.2 A: Proof 12
-3.3 B: Algebra and functions 13
-3.4 C: Coordinate geometry in the ( x , y ) plane 14
-3.5 D: Sequences and series 15
-3.6 E: Trigonometry 16
-3.7 F: Exponentials and logarithms 17
-3.8 G: Differentiation 18
-3.9 H: Integration 19
-3.10 I: Numerical methods 20
-3.11 J: Vectors 20
-3.12 K: Statistical sampling 21
-3.13 L: Data presentation and interpretation 21
-3.14 M: Probability 22
-3.15 N: Statistical distributions 23
-3.16 O: Statistical hypothesis testing 23
-3.17 P: Quantities and units in mechanics 24
-3.18 Q: Kinematics 24
-3.19 R: Forces and Newton’s laws 24
-3.20 S: Moments 25
-3.21 Use of data in statistics 26
-Here the correct output might look like:
-{{
-    "A: Proof": [12, 12],
-    "B: Algebra and functions": [13, 13],
-    ...
-}}
-Now, extract topics from this text:
-{first_pages_text}
-"""
-        global _GEMINI_CLIENT
-        if _GEMINI_CLIENT is None:
-            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
-        client = _GEMINI_CLIENT
-        try:
-            response = client.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=[prompt],
-                config=types.GenerateContentConfig(temperature=0.0)
-            )
-            if not response or not response.text:
-                logger.warning("No text from LLM => returning empty subtopics.")
-                return {}
-            raw_json = response.text.strip()
-            cleaned = raw_json.replace("```json", "").replace("```", "")
-            try:
-                data = json.loads(cleaned)
-            except Exception as json_err:
-                logger.error(f"JSON parsing error: {json_err}")
-                return {}
-            final_dict = {}
-            found_sub_dict = None
-            for k, v in data.items():
-                if isinstance(v, dict):
-                    found_sub_dict = v
-                    break
-            if found_sub_dict is not None:
-                for subk, rng in found_sub_dict.items():
-                    if isinstance(rng, list) and len(rng) == 2:
-                        final_dict[subk] = rng
-            else:
-                for subk, rng in data.items():
-                    if isinstance(rng, list) and len(rng) == 2:
-                        final_dict[subk] = rng
-            return final_dict
-        except Exception as e:
-            logger.error(f"Gemini subtopic extraction error: {e}")
-            return {}
-    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
-        """Return the text of the first ``num_pages`` pages, joined by newlines.
-
-        Args:
-            pdf_path: Local file path, or an ``http://`` / ``https://`` URL
-                that is downloaded first.
-            num_pages: Upper bound on pages to read; capped at the page count.
-
-        Returns:
-            The concatenated page text. An empty string is returned when the
-            download does not answer 200 or when the PDF cannot be read at
-            all; errors are logged, never raised.
-        """
-        text_parts = []
-        try:
-            if pdf_path.startswith(("http://", "https://")):
-                # A timeout keeps a stalled server from hanging the pipeline;
-                # the resulting exception is handled by the except below.
-                response = requests.get(pdf_path, timeout=60)
-                if response.status_code != 200:
-                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
-                    return ""
-                pdf_bytes = response.content
-            else:
-                with open(pdf_path, "rb") as f:
-                    pdf_bytes = f.read()
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            try:
-                pages_to_read = min(num_pages, doc.page_count)
-                for i in range(pages_to_read):
-                    text_parts.append(doc[i].get_text("raw"))
-            finally:
-                # Close the document even if text extraction fails part-way.
-                doc.close()
-        except Exception as e:
-            logger.error(f"Could not open PDF: {e}")
-        return "\n".join(text_parts)
-class MineruNoTextProcessor:
-    """Drive the subtopic-extraction pipeline for one specification PDF.
-
-    ``process`` asks Gemini for subtopic page ranges, narrows the PDF to those
-    pages, runs ``doc_analyze`` / ``pipe_ocr_mode`` on the subset, and writes
-    the merged subtopics to ``<output_folder>/_subtopics.json``.
-    """
-    def __init__(self, output_folder: str, gemini_api_key: str):
-        """Create the output folder and set up the extractor and S3 writer.
-
-        Args:
-            output_folder: Directory that receives ``_subtopics.json``;
-                created if it does not exist.
-            gemini_api_key: Gemini key; when falsy, the ``GEMINI_API_KEY``
-                environment variable (or "") is used instead.
-        """
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        # Options forwarded to doc_analyze() in process().
-        self.layout_model = "doclayout_yolo"
-        self.formula_enable = True
-        self.table_enable = False
-        self.language = "en"
-        # The extractor reads the first 20 pages when looking for the table of contents.
-        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
-        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
-        # NOTE(review): use_s3 is set here but not read anywhere in this class.
-        self.use_s3 = True
-        # S3 credentials and endpoint come from the environment; the bucket name is fixed.
-        self.s3_writer = s3Writer(
-            ak=os.getenv("S3_ACCESS_KEY"),
-            sk=os.getenv("S3_SECRET_KEY"),
-            bucket="quextro-resources",
-            endpoint_url=os.getenv("S3_ENDPOINT")
-        )
-    def cleanup_gpu(self):
-        """Run garbage collection and empty the CUDA cache; failures are only logged."""
-        try:
-            gc.collect()
-            torch.cuda.empty_cache()
-            logger.info("GPU memory cleaned up.")
-        except Exception as e:
-            logger.error(f"Error during GPU cleanup: {e}")
-    def process(self, pdf_path: str) -> Dict[str, Any]:
-        """Extract subtopics from ``pdf_path`` and persist them as JSON.
-
-        Args:
-            pdf_path: Local path or ``http(s)`` URL of the PDF.
-
-        Returns:
-            A dict with ``"final_markdown"`` (post-processed markdown) and
-            ``"subtopics_extracted"`` (the merged subtopic list).
-
-        Raises:
-            Exception: If a URL download does not answer 200. Any other error
-                raised by the steps below also propagates, since there is no
-                ``except`` clause; ``cleanup_gpu`` runs in every case.
-        """
-        logger.info(f"Processing PDF: {pdf_path}")
-        try:
-            # Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
-            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
-            logger.info(f"Gemini returned subtopics: {subtopics}")
-            # Load the PDF bytes, from the network or from disk.
-            if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
-                response = requests.get(pdf_path)
-                if response.status_code != 200:
-                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
-                    raise Exception(f"Failed to download PDF: {pdf_path}")
-                pdf_bytes = response.content
-                logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
-            else:
-                with open(pdf_path, "rb") as f:
-                    pdf_bytes = f.read()
-                logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
-            # The document is opened only to learn the page count.
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            total_pages = doc.page_count
-            doc.close()
-            # Decide which pages to process
-            final_pages = set()
-            if not subtopics:
-                # fallback
-                final_pages = set(range(total_pages))
-            else:
-                # Estimate the shift between page numbers from the table of
-                # contents and 0-based page indices: for every page on which a
-                # subtopic title occurs, record index - (toc_page - 1).
-                offset_candidates = []
-                for subname, rng in subtopics.items():
-                    start_p, _ = rng
-                    occs = find_all_occurrences(pdf_bytes, subname)
-                    for p in occs:
-                        candidate = p - (start_p - 1)
-                        # Only strictly positive shifts are kept as candidates.
-                        if candidate > 0:
-                            offset_candidates.append(candidate)
-                if offset_candidates:
-                    # Most common candidate wins; median is the fallback.
-                    # NOTE(review): the bare `except:` also catches
-                    # KeyboardInterrupt/SystemExit - consider narrowing it.
-                    try:
-                        from statistics import mode
-                        global_offset = mode(offset_candidates)
-                    except:
-                        from statistics import median
-                        global_offset = int(median(offset_candidates))
-                else:
-                    global_offset = 0
-                logger.info(f"Computed global offset: {global_offset}")
-                # Expand each [start, end] range into shifted 0-based indices.
-                # NOTE(review): the indices are not bounded by total_pages here;
-                # presumably create_subset_pdf rejects out-of-range pages - verify.
-                for subname, rng in subtopics.items():
-                    if not (isinstance(rng, list) and len(rng) == 2):
-                        continue
-                    start_p, end_p = rng
-                    if start_p > end_p:
-                        continue
-                    s0 = (start_p - 1) + global_offset
-                    e0 = (end_p - 1) + global_offset
-                    for pp in range(s0, e0 + 1):
-                        final_pages.add(pp)
-            # If every range was skipped, fall back to the whole document.
-            if not final_pages:
-                final_pages = set(range(total_pages))
-            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
-            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
-            # 4) Analyze and produce markdown
-            dataset = PymuDocDataset(subset_pdf_bytes)
-            inference = doc_analyze(
-                dataset,
-                ocr=True,
-                lang=self.language,
-                layout_model=self.layout_model,
-                formula_enable=self.formula_enable,
-                table_enable=self.table_enable
-            )
-            #S3
-            # The writer receives the images emitted by the pipeline and later
-            # post-processes the markdown that references them.
-            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
-            md_prefix = "/topic-extraction/"
-            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
-            md_content = pipe_result.get_markdown(md_prefix)
-            final_markdown = writer.post_process(md_prefix, md_content)
-            # Collect what the writer recorded per table image and merge it.
-            subtopic_list = list(writer.extracted_subtopics.values())
-            subtopic_list = merge_topics(subtopic_list)
-            out_path = os.path.join(self.output_folder, "_subtopics.json")
-            with open(out_path, "w", encoding="utf-8") as f:
-                json.dump(subtopic_list, f, indent=2)
-            logger.info(f"Final subtopics JSON saved locally at {out_path}")
-            return {
-                "final_markdown": final_markdown,
-                "subtopics_extracted": subtopic_list
-            }
-        finally:
-            # Runs on success and on failure alike.
-            self.cleanup_gpu()
-if __name__ == "__main__":
-    # Ad-hoc entry point: run the processor on one hard-coded PDF path.
-    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
-    output_dir = "/home/user/app/pearson_json"
-    # The key must come from the environment. A credential committed as the
-    # fallback default is exposed to everyone who can read the repository.
-    gemini_key = os.getenv("GEMINI_API_KEY", "")
-    if not gemini_key:
-        logger.warning("GEMINI_API_KEY is not set; Gemini requests will fail.")
-    try:
-        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
-        processor.process(input_pdf)
-        logger.info("Processing completed successfully.")
-    except Exception as e:
-        logger.error(f"Processing failed: {e}")

topic_extract_arsenii.py DELETED Viewed

@@ -1,860 +0,0 @@
-#!/usr/bin/env python3
-import os
-import re
-import gc
-import json
-import logging
-import fitz
-import boto3
-import base64
-import time
-import asyncio
-import tempfile
-import requests
-from io import BytesIO
-from typing import List, Dict, Any
-import torch
-import cv2
-import numpy as np
-from google import genai
-from google.genai import types
-from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.data.data_reader_writer.base import DataWriter
-from table_row_extraction import TableExtractor
-# Configure root logging at INFO and give this module's logger an extra
-# handler that writes timestamped records to a log file in the working directory.
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-file_handler = logging.FileHandler("topic_extraction_ars.log")
-file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
-logger.addHandler(file_handler)
-# Module-wide Gemini client; stays None until a helper creates it on first use.
-_GEMINI_CLIENT = None
-def unify_whitespace(text: str) -> str:
-    """Collapse every run of whitespace to one space and trim both ends."""
-    collapsed = re.sub(r"\s+", " ", text)
-    return collapsed.strip()
-def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
-    """Return the 0-based indices of the pages whose text contains ``search_text``.
-
-    Both the needle and each page's text are whitespace-normalised with
-    ``unify_whitespace`` before the substring test, so line breaks inside a
-    heading do not prevent a match.
-
-    Args:
-        pdf_bytes: Raw bytes of the PDF to scan.
-        search_text: Text to look for.
-
-    Returns:
-        Matching page indices in ascending order; an empty list when the
-        needle is empty or contains only whitespace.
-    """
-    st_norm = unify_whitespace(search_text)
-    if not st_norm:
-        # An empty needle is a substring of every page and would report the
-        # whole document as a match; treat it as "nothing to find".
-        return []
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    try:
-        # Pages are visited in order, so the result is already sorted.
-        return [
-            i for i in range(doc.page_count)
-            if st_norm in unify_whitespace(doc[i].get_text("raw"))
-        ]
-    finally:
-        # Close the document even if text extraction raises.
-        doc.close()
-def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
-    """Build a new PDF that contains only the requested pages.
-
-    Args:
-        original_pdf_bytes: Raw bytes of the source PDF.
-        page_indices: 0-based page indices to keep. Duplicates are ignored
-            and pages are copied in ascending order.
-
-    Returns:
-        The bytes of the new PDF.
-
-    Raises:
-        ValueError: If ``page_indices`` is empty or any index lies outside
-            ``0..page_count - 1``.
-    """
-    if not page_indices:
-        raise ValueError("No page indices provided for subset creation.")
-    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
-    try:
-        new_doc = fitz.open()
-        try:
-            for p in sorted(set(page_indices)):
-                if not 0 <= p < doc.page_count:
-                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
-                    raise ValueError(f"Page index {p} out of range.")
-                new_doc.insert_pdf(doc, from_page=p, to_page=p)
-            return new_doc.tobytes()
-        finally:
-            # Release both documents on the error path as well as on success.
-            new_doc.close()
-    finally:
-        doc.close()
-class s3Writer:
-    """Thin wrapper around a boto3 S3 client bound to a single bucket."""
-    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
-        """Create the S3 client from an access key, secret key and endpoint URL."""
-        self.bucket = bucket
-        client_options = {
-            "aws_access_key_id": ak,
-            "aws_secret_access_key": sk,
-            "endpoint_url": endpoint_url,
-        }
-        self.client = boto3.client('s3', **client_options)
-    def write(self, path: str, data: bytes) -> None:
-        """Upload ``data`` under key ``path``; upload errors are logged and re-raised."""
-        try:
-            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
-            logger.info(f"Uploaded to S3: {path}")
-        except Exception as upload_error:
-            logger.error(f"Failed to upload to S3: {str(upload_error)}")
-            raise
-def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
-    """Downscale an image and re-encode it as JPEG.
-
-    The image is shrunk so that its longer side is at most ``max_dim``
-    (smaller images keep their size) and encoded with JPEG quality
-    ``quality``. If the bytes cannot be decoded, or encoding fails, the
-    input bytes are returned unchanged.
-    """
-    buffer = np.frombuffer(image_data, np.uint8)
-    decoded = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
-    if decoded is None:
-        return image_data
-    height, width, _ = decoded.shape
-    longest_side = max(height, width)
-    if longest_side > max_dim:
-        ratio = max_dim / float(longest_side)
-        target_size = (int(width * ratio), int(height * ratio))
-        decoded = cv2.resize(decoded, target_size, interpolation=cv2.INTER_AREA)
-    ok, encoded = cv2.imencode(".jpg", decoded, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
-    return encoded.tobytes() if ok else image_data
-def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
-    """
-    Ask Gemini whether an image shows a two- or three-column table.
-
-    Args:
-        image_data: Image bytes; sent to the model declared as image/jpeg.
-        api_key: Key used to create the shared client the first time one is needed.
-        max_retries: Extra attempts after a failed call (total attempts = max_retries + 1).
-
-    Returns:
-        "TWO_COLUMN", "THREE_COLUMN" or "NO_TABLE". Errors are logged and
-        reported as "NO_TABLE" rather than raised.
-    """
-    for attempt in range(max_retries + 1):
-        try:
-            prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
-The three-column 'table' image includes such key features:
-    - Three columns header
-    - Headers like 'Topics', 'Content', 'Guidelines'
-    - Possibly sections (e.g. 8.4, 9.1)
-The two-column 'table' image includes such key features:
-    - Two columns
-    - Headers like 'Subject content' and 'Additional information'
-    - Possibly sections (e.g. 2.1, 3.4)
-If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
-If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
-If the image does not show a table at all, respond with 'NO_TABLE'.
-Return only one of these exact labels.
-"""
-            # Create the module-wide client once and reuse it afterwards.
-            global _GEMINI_CLIENT
-            if _GEMINI_CLIENT is None:
-                _GEMINI_CLIENT = genai.Client(api_key=api_key)
-            client = _GEMINI_CLIENT
-            # One request carrying the prompt text plus the base64-encoded image.
-            resp = client.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=[
-                    {
-                        "parts": [
-                            {"text": prompt},
-                            {
-                                "inline_data": {
-                                    "mime_type": "image/jpeg",
-                                    "data": base64.b64encode(image_data).decode('utf-8')
-                                }
-                            }
-                        ]
-                    }
-                ],
-                config=types.GenerateContentConfig(temperature=0.0)
-            )
-            # Match on substrings so extra words around the label are tolerated;
-            # "THREE" is tested before "TWO".
-            if resp and resp.text:
-                classification = resp.text.strip().upper()
-                if "THREE" in classification:
-                    return "THREE_COLUMN"
-                elif "TWO" in classification:
-                    return "TWO_COLUMN"
-            # Empty response, or a reply naming neither label.
-            return "NO_TABLE"
-        except Exception as e:
-            logger.error(f"Gemini table classification error: {e}")
-            # An error whose message contains "503" is never retried.
-            if "503" in str(e):
-                return "NO_TABLE"
-            # Otherwise pause briefly and retry until the attempts run out.
-            if attempt < max_retries:
-                time.sleep(0.5)
-            else:
-                return "NO_TABLE"
-async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
-    """Classify one image without blocking the event loop on the Gemini call.
-
-    The image is first shrunk with ``preprocess_image``; the blocking
-    ``call_gemini_for_table_classification`` call then runs in the loop's
-    default executor.
-
-    Returns:
-        The label produced by ``call_gemini_for_table_classification``.
-    """
-    # Inside a coroutine the running loop is the right one to use;
-    # get_event_loop() is deprecated for this purpose.
-    loop = asyncio.get_running_loop()
-    preprocessed = preprocess_image(image_data)
-    return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
-def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
-    """
-    Sends the *image* (not text) of a table cell to Gemini to identify:
-      - A main topic heading in the format: "<number> <Topic Name>", e.g. "2 Algebra and functions"
-      - A subtopic heading in the format: "<number>.<number>", e.g. "2.5", "3.4"
-
-    Args:
-        image_data: Image bytes of the cell; sent declared as image/jpeg.
-        api_key: Key used to create the shared client the first time one is needed.
-        max_retries: Extra attempts after a failed call or an unparsable reply.
-
-    Returns a dict of the form:
-      {
-        "title": "<the recognized main topic or empty if not found>",
-        "subtopics": ["2.5", "2.6", ...]
-      }
-    Both keys are always present. When every attempt fails, the result is
-    {"title": "", "subtopics": []} and the errors are logged, not raised.
-    """
-    # Prompt specifically instructs Gemini to read the image’s text and extract
-    # either a main topic or subtopic heading if present. This is a plain
-    # string, not an f-string, so the JSON example uses single braces.
-    prompt = """
-        You are given an image of a table cell from an educational curriculum specification.
-        The text in this cell may contain:
-        1) A main topic heading in the format "<number> <Topic Name>", for example: "2 Algebra and functions"
-        2) A subtopic heading in the format "<number>.<number>", for example: "2.5" or "3.4"
-        Identify if the cell contains exactly one main topic or subtopic.
-        Return a valid JSON object with the keys "title" and "subtopics" of the form:
-        {
-        "title": "2 Algebra and functions",
-        "subtopics": ["2.5", "2.6"]
-        }
-        If you find a main topic (like '2 Algebra and functions'), put it in "title".
-        If you find subtopic numbers (like '2.5', '3.4'), put them in the "subtopics" array.
-        """
-    # Re-use or initialize the global Gemini client:
-    global _GEMINI_CLIENT
-    if _GEMINI_CLIENT is None:
-        _GEMINI_CLIENT = genai.Client(api_key=api_key)
-    client = _GEMINI_CLIENT
-    for attempt in range(max_retries + 1):
-        try:
-            # Send the prompt + image to Gemini:
-            resp = client.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=[
-                    {
-                        "parts": [
-                            {"text": prompt},
-                            {
-                                "inline_data": {
-                                    "mime_type": "image/jpeg",
-                                    "data": base64.b64encode(image_data).decode("utf-8")
-                                }
-                            }
-                        ]
-                    }
-                ],
-                config=types.GenerateContentConfig(temperature=0.0)
-            )
-            if not resp or not resp.text:
-                return {"title": "", "subtopics": []}
-            raw = resp.text.strip().replace("```json", "").replace("```", "")
-            logger.info(f"== RAW == {raw}")
-            data = json.loads(raw)
-            if not isinstance(data, dict):
-                return {"title": "", "subtopics": []}
-            # Tolerate missing or null keys instead of raising KeyError.
-            title = data.get("title") or ""
-            subtopics = data.get("subtopics") or []
-            if not isinstance(subtopics, list):
-                subtopics = [subtopics]
-            return {"title": title, "subtopics": subtopics}
-        except Exception as e:
-            logger.error(f"Gemini subtopic identification error: {e}")
-            if attempt < max_retries:
-                time.sleep(0.5)
-    return {"title": "", "subtopics": []}
-class S3ImageWriter(DataWriter):
-    """DataWriter that uploads each image to S3 and later turns table images into subtopic JSON.
-
-    ``write`` stores every image under a sequential key. ``post_process``
-    classifies the stored images with Gemini, rewrites the markdown image
-    references, splits detected tables into cells and records one subtopic
-    structure per table in ``extracted_subtopics``.
-    """
-    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
-        """Store the collaborators and normalise ``base_path`` to end with "/"."""
-        self.s3_writer = s3_writer
-        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
-        self.gemini_api_key = gemini_api_key
-        # Maps the path given to write() to the image bytes, S3 key,
-        # classification label and final alt text.
-        self.descriptions = {}
-        self._img_count = 0
-        # NOTE(review): extracted_tables is initialised but not filled in this class.
-        self.extracted_tables = {}
-        # New attribute to store final subtopic JSON
-        self.extracted_subtopics = {}
-    def write(self, path: str, data: bytes) -> None:
-        """Upload ``data`` as ``<base_path>img_<n>.jpg`` and remember it under ``path``."""
-        self._img_count += 1
-        unique_id = f"img_{self._img_count}.jpg"
-        s3_key = f"{self.base_path}{unique_id}"
-        self.s3_writer.write(s3_key, data)
-        self.descriptions[path] = {
-            "data": data,
-            "s3_path": s3_key,
-            "table_classification": "NO_TABLE",
-            "final_alt": ""
-        }
-    async def post_process_async(self, key: str, md_content: str) -> str:
-        """Classify all images, rewrite the markdown and return only its image lines.
-
-        Args:
-            key: Prefix that precedes each image path inside the markdown links.
-            md_content: Markdown produced by the pipeline.
-
-        Returns:
-            The lines of the rewritten markdown that start with an image
-            reference, joined by newlines.
-        """
-        logger.info("Classifying images to detect tables.")
-        # One task per stored image; all classifications run concurrently.
-        tasks = {
-            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
-            for p, info in self.descriptions.items()
-        }
-        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
-        # A failed classification is logged and treated as "NO_TABLE".
-        for p, result in zip(tasks.keys(), results):
-            if isinstance(result, Exception):
-                logger.error(f"Table classification error for {p}: {result}")
-                self.descriptions[p]['table_classification'] = "NO_TABLE"
-            else:
-                self.descriptions[p]['table_classification'] = result
-        # 2) Replace the original markdown references with alt text
-        for p, info in self.descriptions.items():
-            cls = info['table_classification']
-            if cls == "TWO_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
-            elif cls == "THREE_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
-            else:
-                info['final_alt'] = "NO_TABLE image"
-            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
-        md_content = await self._process_table_images_in_markdown(key, md_content)
-        # Filter final lines to keep only lines with images
-        final_lines = [
-            line.strip() for line in md_content.split("\n")
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
-        ]
-        return "\n".join(final_lines)
-    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
-        """Split every table image marked "HAS TO BE PROCESSED" into cells and record its subtopics.
-
-        For each marked image the cells are extracted with ``TableExtractor``,
-        uploaded to S3 and sent to Gemini; the recognised title and subtopic
-        numbers are stored in ``self.extracted_subtopics[s3_key]`` and the
-        markdown line is replaced by one image line per cell. A failure while
-        handling one table is logged and the loop moves on to the next table.
-        """
-        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
-        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
-        if not matches:
-            return md_content
-        for (col_type, s3_key) in matches:
-            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
-            # Look up the original bytes of this table image by its S3 key.
-            img_data = None
-            for desc in self.descriptions.values():
-                if desc.get("s3_path") == s3_key:
-                    img_data = desc.get("data")
-                    break
-            if img_data is None:
-                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
-                continue
-            # TableExtractor is given a file path, so the bytes go to a temp file first.
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
-                temp_file.write(img_data)
-                temp_path = temp_file.name
-            try:
-                # Two- and three-column tables use different merge settings.
-                if col_type.lower() == 'two':
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=True,
-                        enable_subtopic_merge=True,
-                        subtopic_threshold=0.2
-                    )
-                else:
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=False,
-                        enable_subtopic_merge=False,
-                        subtopic_threshold=0.2
-                    )
-                row_boxes = extractor.process_image(temp_path)
-                #save cell images to S3 or local
-                # NOTE(review): out_folder is never removed; the finally below
-                # deletes only temp_path.
-                out_folder = temp_path + "_rows"
-                os.makedirs(out_folder, exist_ok=True)
-                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
-                recognized_main_topic = None
-                recognized_subtopics = []
-                for i, row in enumerate(row_boxes):
-                    row_dir = os.path.join(out_folder, f"row_{i}")
-                    for j, _ in enumerate(row):
-                        # Cell files are read from row_<i>/col_<j>.jpg under out_folder.
-                        cell_path = os.path.join(row_dir, f"col_{j}.jpg")
-                        # if not os.path.isfile(cell_path):
-                        #     continue
-                        with open(cell_path, "rb") as cf:
-                            cell_image_data = cf.read()
-                        # store that cell image to S3
-                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg"
-                        self.s3_writer.write(cell_key, cell_image_data)
-                        # Call Gemini with the cell image
-                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
-                        logger.debug(f"== INFO == {info}")
-                        # e.g. info = {"title": "2 Algebra and functions", "subtopics": ["2.5"]}
-                        # 3d) Merge the recognized topic/subtopics
-                        # The last non-empty title wins; subtopics accumulate.
-                        if info["title"]:
-                            recognized_main_topic = info["title"]
-                        if info["subtopics"]:
-                            recognized_subtopics.extend(info["subtopics"])
-                # NOTE(review): this pass uploads the whole table image (img_data)
-                # once per cell under a "table_s3_" key, and cell_texts is filled
-                # with placeholders but never read - looks like leftover scaffolding.
-                snippet = ["**Extracted table cells:**"]
-                cell_texts = []
-                for i, row in enumerate(row_boxes):
-                    for j, box in enumerate(row):
-                        cell_key = f"{self.base_path}cells/table_s3_{os.path.basename(s3_key)}_r{i}_c{j}.jpg"
-                        self.s3_writer.write(cell_key, img_data)  # or cell_data if you truly cropped
-                        text = "..."  # placeholder
-                        cell_texts.append(text)
-                        snippet.append(f"![Row {i} Col {j}]({cell_key})")
-                # One record per table: the table image itself plus one child per subtopic number.
-                final_json = {
-                "title": recognized_main_topic,
-                "contents": [
-                    {
-                        "type": "image",
-                        "key": s3_key
-                    }
-                ],
-                "children": []
-                }
-                for st in recognized_subtopics:
-                    final_json["children"].append({
-                        "title": st,
-                        "contents": [
-                            {"type": "image", "key": f"subtopic_{st}_example.jpg"}
-                        ]
-                    })
-                self.extracted_subtopics[s3_key] = final_json
-                # Replace the original table image line in the markdown with the snippet
-                new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
-                md_content = md_content.replace(old_line, new_snip)
-                # NOTE(review): old_line was already replaced just above, so the
-                # replace below finds nothing and the "table_s3_" snippet stays
-                # in the markdown - confirm which snippet is intended.
-                snippet = ["**Extracted table cells:**"]
-                for i, row in enumerate(row_boxes):
-                    for j, _ in enumerate(row):
-                        snippet.append(f"![Row {i} Col {j}]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg)")
-                new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
-                md_content = md_content.replace(old_line, new_snip)
-            except Exception as e:
-                logger.error(f"Error processing table image {s3_key}: {e}")
-            finally:
-                os.remove(temp_path)
-        return md_content
-    def post_process(self, key: str, md_content: str) -> str:
-        """Synchronous wrapper that runs ``post_process_async`` via ``asyncio.run``."""
-        return asyncio.run(self.post_process_async(key, md_content))
-class LocalImageWriter(DataWriter):
-    def __init__(self, output_folder: str, gemini_api_key: str):
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        self.descriptions = {}
-        self._img_count = 0
-        self.gemini_api_key = gemini_api_key
-        self.extracted_tables = {}
-    def write(self, path: str, data: bytes) -> None:
-        self._img_count += 1
-        unique_id = f"img_{self._img_count}.jpg"
-        self.descriptions[path] = {
-            "data": data,
-            "relative_path": unique_id,
-            "table_classification": "NO_TABLE",
-            "final_alt": ""
-        }
-        # Also save the original image locally for testing.
-        image_path = os.path.join(self.output_folder, unique_id)
-        with open(image_path, "wb") as f:
-            f.write(data)
-    async def post_process_async(self, key: str, md_content: str) -> str:
-        logger.info("Classifying images to detect tables.")
-        tasks = []
-        for p, info in self.descriptions.items():
-            tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
-        for p, task in tasks:
-            try:
-                classification = await task
-                self.descriptions[p]['table_classification'] = classification
-            except Exception as e:
-                logger.error(f"Table classification error: {e}")
-                self.descriptions[p]['table_classification'] = "NO_TABLE"
-        for p, info in self.descriptions.items():
-            cls = info['table_classification']
-            if cls == "TWO_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
-            elif cls == "THREE_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
-            else:
-                info['final_alt'] = "NO_TABLE image"
-            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")
-        md_content = self._process_table_images_in_markdown(md_content)
-        final_lines = []
-        for line in md_content.split("\n"):
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
-                final_lines.append(line.strip())
-        return "\n".join(final_lines)
-    def _process_table_images_in_markdown(self, md_content: str) -> str:
-        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
-        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
-        if not matches:
-            return md_content
-        for (col_type, image_id) in matches:
-            logger.info(f"Processing table image => {image_id}, columns={col_type}")
-            temp_path = os.path.join(self.output_folder, image_id)
-            desc_item = None
-            for k, val in self.descriptions.items():
-                if val["relative_path"] == image_id:
-                    desc_item = val
-                    break
-            if not desc_item:
-                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
-                continue
-            if not os.path.exists(temp_path):
-                with open(temp_path, "wb") as f:
-                    f.write(desc_item["data"])
-            try:
-                if col_type.lower() == 'two':       #check for table_row_extr script for more details
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=True,
-                        enable_subtopic_merge=True,
-                        subtopic_threshold=0.2
-                    )
-                else:
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=False,
-                        enable_subtopic_merge=False,
-                        subtopic_threshold=0.2
-                    )
-                row_boxes = extractor.process_image(temp_path)
-                out_folder = temp_path + "_rows"
-                os.makedirs(out_folder, exist_ok=True)
-                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
-                # List all extracted cell images relative to the output folder.
-                extracted_cells = []
-                for root, dirs, files in os.walk(out_folder):
-                    for file in files:
-                        rel_path = os.path.relpath(os.path.join(root, file), self.output_folder)
-                        extracted_cells.append(rel_path)
-                # Save mapping for testing.
-                self.extracted_tables[image_id] = extracted_cells
-                snippet = ["**Extracted table cells:**"]
-                for i, row in enumerate(row_boxes):
-                    row_dir = os.path.join(out_folder, f"row_{i}")
-                    for j, _ in enumerate(row):
-                        cell_file = f"col_{j}.jpg"
-                        cell_path = os.path.join(row_dir, cell_file)
-                        relp = os.path.relpath(cell_path, self.output_folder)
-                        snippet.append(f"![Row {i} Col {j}]({relp})")
-                new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
-                md_content = md_content.replace(old_line, new_snip)
-            except Exception as e:
-                logger.error(f"Error processing table image {image_id}: {e}")
-            finally:
-                if os.path.exists(temp_path):
-                    os.remove(temp_path)
-        return md_content
-    def post_process(self, key: str, md_content: str) -> str:
-        """Blocking wrapper around ``post_process_async``.
-
-        Runs the coroutine to completion with ``asyncio.run`` and returns its
-        result for the given ``key`` and ``md_content``.
-        """
-        pending = self.post_process_async(key, md_content)
-        processed_markdown = asyncio.run(pending)
-        return processed_markdown
-class GeminiTopicExtractor:
-    """Ask Gemini for subtopic page ranges based on the first pages of a PDF.
-
-    The text of the first ``num_pages`` pages is embedded in a prompt, and the
-    model's JSON reply is reduced to a ``{name: [start_page, end_page]}``
-    mapping.
-    """
-    # Seconds to wait on the HTTP download before giving up; without a timeout
-    # ``requests.get`` can block indefinitely on a stalled connection.
-    _DOWNLOAD_TIMEOUT = 60
-    def __init__(self, api_key: str = None, num_pages: int = 14):
-        """Store the API key (argument first, then ``GEMINI_API_KEY``) and page budget."""
-        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
-        self.num_pages = num_pages
-    @staticmethod
-    def _is_page_range(value: Any) -> bool:
-        """Return True when ``value`` is a two-item list of integers."""
-        return (
-            isinstance(value, list)
-            and len(value) == 2
-            and all(isinstance(page, int) for page in value)
-        )
-    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
-        """Return ``{subtopic name: [start_page, end_page]}`` as reported by Gemini.
-
-        Args:
-            pdf_path: Local path or http(s) URL of the PDF.
-
-        Returns:
-            The mapping of subtopic names to two-integer page ranges, or an empty
-            dict when no text could be read, the model returned nothing, the reply
-            was not a JSON object, or the request failed.
-        """
-        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
-        if not first_pages_text.strip():
-            logger.error("No text from first pages => cannot extract subtopics.")
-            return {}
-        prompt = f"""
-You have the first pages of a PDF specification, including a table of contents.
-Instructions:
-1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
-2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
-3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
-4. Output only valid JSON of the form:
-    {{
-    "Subtopic A": [start_page, end_page],
-    "Subtopic B": [start_page, end_page]
-    }}
-5. If you can't find any subtopics, return an empty JSON.
-Important notes:
-- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
-- The final output must be valid JSON only, with no extra text or code blocks.
-Examples:
-1. Given this table of contents:
-1 Introduction – 2
-    Why choose Edexcel A Level Mathematics? - 2
-    Supporting you in planning and implementing this qualification - 3
-    Qualification at a glance - 5
-2 Subject content and assessment information – 7
-    Paper 1 and Paper 2: Pure Mathematics - 11
-    Paper 3: Statistics and Mechanics - 30
-    Assessment Objectives - 40
-3 Administration and general information – 42
-    Entries - 42
-    Access arrangements, reasonable adjustments, special consideration and malpractice - 42
-    Student recruitment and progression - 45
-Appendix 1: Formulae – 49
-Appendix 2: Notation – 53
-Appendix 3: Use of calculators – 59
-Appendix 4: Assessment Objectives – 60
-Appendix 5: The context for the development of this qualification – 62
-Appendix 6: Transferable skills – 64
-Appendix 7: Level 3 Extended Project qualification – 65
-Appendix 8: Codes – 67
-The correct output should be:
-{{
-    "Paper 1 and Paper 2: Pure Mathematics": [11, 29],
-    "Paper 3: Statistics and Mechanics": [30, 42]
-}}
-2. Given this table of contents:
-Qualification at a glance – 1
-    Assessment Objectives and weightings - 4
-Knowledge, skills and understanding – 5
-    Theme 1: Introduction to markets and market failure - 5
-    Theme 2: The UK economy – performance and policies - 11
-    Theme 3: Business behaviour and the labour market - 21
-    Theme 4: A global perspective - 29
-Assessment – 39
-    Assessment summary - 39
-    Assessment objectives - 41
-    Assessment overview - 42
-    Breakdown of assessment objectives - 42
-        Synoptic assessment - 43
-        Discount code and performance tables - 43
-        Access arrangements, reasonable adjustments and special consideration - 44
-        Malpractice - 45
-        Equality Act 2010 and Pearson equality policy - 45
-        Synoptic assessment - 46
-        Awarding and reporting - 47
-Other information – 49
-    Student recruitment -49
-    Prior learning and other requirements -49
-    Progression - 49
-Appendix 1: Transferable skills – 53
-Appendix 2: Level 3 Extended Project qualification – 55
-Appendix 3: Quantitative skills – 59
-Appendix 4: Codes – 61
-Appendix 5: Index – 63
-The correct output should be:
-{{
-    "Theme 1: Introduction to markets and market failure": [5, 10],
-    "Theme 2: The UK economy – performance and policies": [11, 20],
-    "Theme 3: Business behaviour and the labour market": [21, 28],
-    "Theme 4: A global perspective": [29, 38]
-}}
-3. You might also see sections like:
-2.1 AS Unit 1 11
-2.2 AS Unit 2 18
-2.3 A2 Unit 3 24
-2.4 A2 Unit 4 31
-In that scenario, your output might look like:
-{{
-    "2.1 AS Unit 1": [11, 17],
-    "2.2 AS Unit 2": [18, 23],
-    "2.3 A2 Unit 3": [24, 30],
-    "2.4 A2 Unit 4": [31, 35]
-}}
-4. Another example might list subtopics:
-3.1 Overarching themes 11
-3.2 A: Proof 12
-3.3 B: Algebra and functions 13
-3.4 C: Coordinate geometry in the ( x , y ) plane 14
-3.5 D: Sequences and series 15
-3.6 E: Trigonometry 16
-3.7 F: Exponentials and logarithms 17
-3.8 G: Differentiation 18
-3.9 H: Integration 19
-3.10 I: Numerical methods 20
-3.11 J: Vectors 20
-3.12 K: Statistical sampling 21
-3.13 L: Data presentation and interpretation 21
-3.14 M: Probability 22
-3.15 N: Statistical distributions 23
-3.16 O: Statistical hypothesis testing 23
-3.17 P: Quantities and units in mechanics 24
-3.18 Q: Kinematics 24
-3.19 R: Forces and Newton’s laws 24
-3.20 S: Moments 25
-3.21 Use of data in statistics 26
-Here the correct output might look like:
-{{
-    "A: Proof": [12, 12],
-    "B: Algebra and functions": [13, 13],
-    ...
-}}
-Now, extract topics from this text:
-{first_pages_text}
-"""
-        # The client is created lazily and shared through the module-level global.
-        global _GEMINI_CLIENT
-        if _GEMINI_CLIENT is None:
-            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
-        client = _GEMINI_CLIENT
-        try:
-            response = client.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=[prompt],
-                config=types.GenerateContentConfig(temperature=0.0)
-            )
-            if not response or not response.text:
-                logger.warning("No text from LLM => returning empty subtopics.")
-                return {}
-            raw_json = response.text.strip()
-            # Strip Markdown code fences in case the model adds them despite the prompt.
-            cleaned = raw_json.replace("```json", "").replace("```", "")
-            try:
-                data = json.loads(cleaned)
-            except json.JSONDecodeError as json_err:
-                logger.error(f"JSON parsing error: {json_err}")
-                return {}
-            if not isinstance(data, dict):
-                # e.g. a JSON array: there is no name -> range mapping to read.
-                logger.error("Unexpected JSON structure from LLM: expected an object, got %s", type(data).__name__)
-                return {}
-            # If the reply wraps the mapping in an outer object, use the first
-            # nested object; otherwise read the ranges from the top level.
-            nested = next((v for v in data.values() if isinstance(v, dict)), None)
-            candidates = data if nested is None else nested
-            return {
-                name: rng
-                for name, rng in candidates.items()
-                if self._is_page_range(rng)
-            }
-        except Exception as e:
-            logger.error(f"Gemini subtopic extraction error: {e}")
-            return {}
-    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
-        """Return the text of the first ``num_pages`` pages, joined by newlines.
-
-        ``pdf_path`` may be a local path or an http(s) URL. Failures are logged
-        and whatever text was collected so far (possibly empty) is returned.
-        """
-        text_parts = []
-        try:
-            if pdf_path.startswith(("http://", "https://")):
-                response = requests.get(pdf_path, timeout=self._DOWNLOAD_TIMEOUT)
-                if response.status_code != 200:
-                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
-                    return ""
-                pdf_bytes = response.content
-            else:
-                with open(pdf_path, "rb") as f:
-                    pdf_bytes = f.read()
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            try:
-                pages_to_read = min(num_pages, doc.page_count)
-                for i in range(pages_to_read):
-                    text_parts.append(doc[i].get_text("raw"))
-            finally:
-                # Close the document even when reading a page raises.
-                doc.close()
-        except Exception as e:
-            logger.error(f"Could not open PDF: {e}")
-        return "\n".join(text_parts)
-class MineruNoTextProcessor:
-    """Analyse the subtopic pages of a PDF and write the extracted subtopics as JSON.
-
-    Gemini supplies subtopic page ranges from the table of contents; those
-    ranges are shifted by an estimated page offset, the resulting pages are cut
-    into a subset PDF, and that subset goes through ``doc_analyze`` in OCR mode
-    with images handled by ``S3ImageWriter``.
-    """
-    # Seconds to wait on the HTTP download; without a timeout ``requests.get``
-    # can block indefinitely on a stalled connection.
-    _DOWNLOAD_TIMEOUT = 60
-    def __init__(self, output_folder: str, gemini_api_key: str):
-        """Create the output folder and configure the analysis and S3 writer.
-
-        S3 credentials and endpoint are read from ``S3_ACCESS_KEY``,
-        ``S3_SECRET_KEY`` and ``S3_ENDPOINT``.
-        """
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        self.layout_model = "doclayout_yolo"
-        self.formula_enable = True
-        self.table_enable = False
-        self.language = "en"
-        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
-        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
-        self.use_s3 = True
-        self.s3_writer = s3Writer(
-            ak=os.getenv("S3_ACCESS_KEY"),
-            sk=os.getenv("S3_SECRET_KEY"),
-            bucket="quextro-resources",
-            endpoint_url=os.getenv("S3_ENDPOINT")
-        )
-    def cleanup_gpu(self):
-        """Run garbage collection and empty the CUDA cache; errors are only logged."""
-        try:
-            gc.collect()
-            torch.cuda.empty_cache()
-            logger.info("GPU memory cleaned up.")
-        except Exception as e:
-            logger.error(f"Error during GPU cleanup: {e}")
-    @staticmethod
-    def _is_valid_range(rng: Any) -> bool:
-        """Return True when ``rng`` is a two-item list of integers."""
-        return (
-            isinstance(rng, list)
-            and len(rng) == 2
-            and all(isinstance(page, int) for page in rng)
-        )
-    def _load_pdf_bytes(self, pdf_path: str) -> bytes:
-        """Return the PDF content from an http(s) URL or a local file.
-
-        Raises:
-            Exception: If the download does not answer with status 200.
-        """
-        if pdf_path.startswith(("http://", "https://")):
-            response = requests.get(pdf_path, timeout=self._DOWNLOAD_TIMEOUT)
-            if response.status_code != 200:
-                logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
-                raise Exception(f"Failed to download PDF: {pdf_path}")
-            pdf_bytes = response.content
-            logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
-            return pdf_bytes
-        with open(pdf_path, "rb") as f:
-            pdf_bytes = f.read()
-        logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
-        return pdf_bytes
-    def _select_pages(self, pdf_bytes: bytes, subtopics: Dict[str, List[int]], total_pages: int) -> set:
-        """Return the 0-based page indexes to analyse.
-
-        Falls back to every page when there are no subtopics or when no
-        subtopic range lands inside the document.
-        """
-        from statistics import StatisticsError, median, mode
-        all_pages = set(range(total_pages))
-        if not subtopics:
-            return all_pages
-        # Ignore malformed ranges up front so both passes see the same data.
-        valid = {name: rng for name, rng in subtopics.items() if self._is_valid_range(rng)}
-        # Each occurrence of a subtopic title suggests an offset between the
-        # printed page number and the 0-based PDF page index.
-        offset_candidates = []
-        for subname, (start_p, _) in valid.items():
-            for p in find_all_occurrences(pdf_bytes, subname):
-                candidate = p - (start_p - 1)
-                if candidate > 0:
-                    offset_candidates.append(candidate)
-        if offset_candidates:
-            try:
-                global_offset = mode(offset_candidates)
-            except StatisticsError:
-                # Python < 3.8 raises when there is no unique most common value.
-                global_offset = int(median(offset_candidates))
-        else:
-            global_offset = 0
-        logger.info(f"Computed global offset: {global_offset}")
-        final_pages = set()
-        for start_p, end_p in valid.values():
-            if start_p > end_p:
-                continue
-            # Keep only indexes that exist in the document: a shifted range can
-            # run past the last page or start before the first one.
-            s0 = max(0, (start_p - 1) + global_offset)
-            e0 = min(total_pages - 1, (end_p - 1) + global_offset)
-            final_pages.update(range(s0, e0 + 1))
-        return final_pages or all_pages
-    def process(self, pdf_path: str) -> Dict[str, Any]:
-        """Process ``pdf_path`` and return the final markdown and extracted subtopics.
-
-        Args:
-            pdf_path: Local path or http(s) URL of the PDF.
-
-        Returns:
-            A dict with ``final_markdown`` and ``subtopics_extracted``. The
-            subtopics are also written to ``final_subtopics.json`` in the
-            output folder.
-
-        Raises:
-            Exception: If the PDF cannot be downloaded; other errors propagate.
-        """
-        logger.info(f"Processing PDF: {pdf_path}")
-        try:
-            # 1) Ask Gemini for subtopic page ranges based on the first pages.
-            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
-            logger.info(f"Gemini returned subtopics: {subtopics}")
-            # 2) Load the whole PDF and count its pages.
-            pdf_bytes = self._load_pdf_bytes(pdf_path)
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            try:
-                total_pages = doc.page_count
-            finally:
-                doc.close()
-            # 3) Decide which pages to process.
-            final_pages = self._select_pages(pdf_bytes, subtopics, total_pages)
-            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
-            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
-            # 4) Analyze and produce markdown.
-            dataset = PymuDocDataset(subset_pdf_bytes)
-            inference = doc_analyze(
-                dataset,
-                ocr=True,
-                lang=self.language,
-                layout_model=self.layout_model,
-                formula_enable=self.formula_enable,
-                table_enable=self.table_enable
-            )
-            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
-            md_prefix = "/topic-extraction/"
-            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
-            md_content = pipe_result.get_markdown(md_prefix)
-            final_markdown = writer.post_process(md_prefix, md_content)
-            subtopic_list = list(writer.extracted_subtopics.values())
-            out_path = os.path.join(self.output_folder, "final_subtopics.json")
-            with open(out_path, "w", encoding="utf-8") as f:
-                json.dump(subtopic_list, f, indent=2)
-            logger.info(f"Final subtopics JSON saved locally at {out_path}")
-            return {
-                "final_markdown": final_markdown,
-                "subtopics_extracted": subtopic_list
-            }
-        finally:
-            # Always release GPU memory, whether processing succeeded or not.
-            self.cleanup_gpu()
-if __name__ == "__main__":
-    # Manual test run against a fixed local specification PDF.
-    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
-    output_dir = "/home/user/app/we/we_ars"
-    # Credentials must come from the environment; never keep a fallback API
-    # key in source control.
-    gemini_key = os.getenv("GEMINI_API_KEY", "")
-    if not gemini_key:
-        logger.error("GEMINI_API_KEY is not set; cannot run topic extraction.")
-        raise SystemExit(1)
-    try:
-        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
-        # The result includes final_markdown and subtopics_extracted.
-        result = processor.process(input_pdf)
-        logger.info("Processing completed successfully.")
-    except Exception as e:
-        logger.error(f"Processing failed: {e}")

topic_extraction.py CHANGED Viewed

@@ -35,6 +35,7 @@ logger.addHandler(file_handler)
 _GEMINI_CLIENT = None
 def unify_whitespace(text: str) -> str:
     return re.sub(r"\s+", " ", text).strip()
@@ -66,6 +67,123 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
     doc.close()
     return subset_bytes
 class s3Writer:
     def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
         self.bucket = bucket
@@ -77,7 +195,6 @@ class s3Writer:
         )
     def write(self, path: str, data: bytes) -> None:
-        """Upload data to S3 using proper keyword arguments"""
         try:
             file_obj = BytesIO(data)
             self.client.upload_fileobj(
@@ -90,6 +207,13 @@ class s3Writer:
             logger.error(f"Failed to upload to S3: {str(e)}")
             raise
 def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
     arr = np.frombuffer(image_data, np.uint8)
     img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
@@ -107,27 +231,30 @@ def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -
     return image_data
 def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
     for attempt in range(max_retries + 1):
         try:
             prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
-The three-column 'table' image include such key features:
-    - Three columns header columns
-    - Headers like 'Topics', 'Content', 'Guidelines'
-    - Numbered sections (e.g., 8.4, 9.1)
-    - Educational curriculum-style structure
-The two-column 'table' image include such key features:
-    - Two columns header columns
-    - Headers like 'Subject content' and 'Additional information'
-    - Numbered sections (e.g., 2.1, 3.4)
-    - Educational curriculum-style structure
-    - Bullet description in 'Additional information'
 If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
 If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
-If the image does not show a table at all, respond with 'NO_TABLE'.
 Return only one of these exact labels.
 """
             global _GEMINI_CLIENT
             client = _GEMINI_CLIENT
             resp = client.models.generate_content(
                 model="gemini-2.0-flash",
                 contents=[
@@ -143,7 +270,7 @@ Return only one of these exact labels.
                         ]
                     }
                 ],
-                config=types.GenerateContentConfig(temperature=0.)
             )
             if resp and resp.text:
                 classification = resp.text.strip().upper()
@@ -151,6 +278,8 @@ Return only one of these exact labels.
                     return "THREE_COLUMN"
                 elif "TWO" in classification:
                     return "TWO_COLUMN"
             return "NO_TABLE"
         except Exception as e:
             logger.error(f"Gemini table classification error: {e}")
@@ -166,14 +295,158 @@ async def classify_image_async(image_data: bytes, api_key: str, max_retries: int
     preprocessed = preprocess_image(image_data)
     return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
 class S3ImageWriter(DataWriter):
     def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
         self.s3_writer = s3_writer
-        # Use the provided base_path (which can be based on the PDF file name)
         self.base_path = base_path if base_path.endswith("/") else base_path + "/"
         self.gemini_api_key = gemini_api_key
         self.descriptions = {}
         self._img_count = 0
     def write(self, path: str, data: bytes) -> None:
         self._img_count += 1
@@ -189,33 +462,45 @@ class S3ImageWriter(DataWriter):
     async def post_process_async(self, key: str, md_content: str) -> str:
         logger.info("Classifying images to detect tables.")
-        tasks = []
-        for p, info in self.descriptions.items():
-            tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
-        for p, task in tasks:
-            try:
-                classification = await task
-                self.descriptions[p]['table_classification'] = classification
-            except Exception as e:
-                logger.error(f"Table classification error: {e}")
                 self.descriptions[p]['table_classification'] = "NO_TABLE"
-        for p, info in self.descriptions.items():
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
                 info['final_alt'] = "HAS TO BE PROCESSED - two column table"
             elif cls == "THREE_COLUMN":
                 info['final_alt'] = "HAS TO BE PROCESSED - three column table"
             else:
                 info['final_alt'] = "NO_TABLE image"
             md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
         md_content = await self._process_table_images_in_markdown(key, md_content)
-        final_lines = []
-        for line in md_content.split("\n"):
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
-                final_lines.append(line.strip())
         return "\n".join(final_lines)
     async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
@@ -223,6 +508,7 @@ class S3ImageWriter(DataWriter):
         matches = re.findall(pat, md_content, flags=re.IGNORECASE)
         if not matches:
             return md_content
         for (col_type, s3_key) in matches:
             logger.info(f"Processing table image: {s3_key}, columns={col_type}")
             img_data = None
@@ -233,9 +519,12 @@ class S3ImageWriter(DataWriter):
             if img_data is None:
                 logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                 continue
             with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                 temp_file.write(img_data)
                 temp_path = temp_file.name
             try:
                 if col_type.lower() == 'two':
                     extractor = TableExtractor(
@@ -252,141 +541,93 @@ class S3ImageWriter(DataWriter):
                         subtopic_threshold=0.2
                     )
                 row_boxes = extractor.process_image(temp_path)
-                snippet = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     for j, _ in enumerate(row):
-                        cell_unique_key = f"{self.base_path}cells/{os.path.basename(s3_key).split('.')[0]}_row{i}_col{j}.jpg"
-                        self.s3_writer.write(cell_unique_key, img_data)
-                        snippet.append(f"![Row {i} Col {j}]({cell_unique_key})")
-                new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
-                md_content = md_content.replace(old_line, new_snip)
-            except Exception as e:
-                logger.error(f"Error processing table image {s3_key}: {e}")
-            finally:
-                try:
-                    os.remove(temp_path)
-                except Exception:
-                    pass
-        return md_content
-    def post_process(self, key: str, md_content: str) -> str:
-        return asyncio.run(self.post_process_async(key, md_content))
-class LocalImageWriter(DataWriter):
-    def __init__(self, output_folder: str, gemini_api_key: str):
-        self.output_folder = output_folder
-        os.makedirs(self.output_folder, exist_ok=True)
-        self.descriptions = {}
-        self._img_count = 0
-        self.gemini_api_key = gemini_api_key
-        # New mapping to store extracted table cell image paths for testing.
-        self.extracted_tables = {}
-    def write(self, path: str, data: bytes) -> None:
-        self._img_count += 1
-        unique_id = f"img_{self._img_count}.jpg"
-        self.descriptions[path] = {
-            "data": data,
-            "relative_path": unique_id,
-            "table_classification": "NO_TABLE",
-            "final_alt": ""
-        }
-        # Also save the original image locally for testing.
-        image_path = os.path.join(self.output_folder, unique_id)
-        with open(image_path, "wb") as f:
-            f.write(data)
-    async def post_process_async(self, key: str, md_content: str) -> str:
-        logger.info("Classifying images to detect tables.")
-        tasks = []
-        for p, info in self.descriptions.items():
-            tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
-        for p, task in tasks:
-            try:
-                classification = await task
-                self.descriptions[p]['table_classification'] = classification
-            except Exception as e:
-                logger.error(f"Table classification error: {e}")
-                self.descriptions[p]['table_classification'] = "NO_TABLE"
-        for p, info in self.descriptions.items():
-            cls = info['table_classification']
-            if cls == "TWO_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
-            elif cls == "THREE_COLUMN":
-                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
-            else:
-                info['final_alt'] = "NO_TABLE image"
-            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")
-        md_content = self._process_table_images_in_markdown(md_content)
-        final_lines = []
-        for line in md_content.split("\n"):
-            if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
-                final_lines.append(line.strip())
-        return "\n".join(final_lines)
-    def _process_table_images_in_markdown(self, md_content: str) -> str:
-        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
-        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
-        if not matches:
-            return md_content
-        for (col_type, image_id) in matches:
-            logger.info(f"Processing table image => {image_id}, columns={col_type}")
-            temp_path = os.path.join(self.output_folder, image_id)
-            desc_item = None
-            for k, val in self.descriptions.items():
-                if val["relative_path"] == image_id:
-                    desc_item = val
-                    break
-            if not desc_item:
-                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
-                continue
-            if not os.path.exists(temp_path):
-                with open(temp_path, "wb") as f:
-                    f.write(desc_item["data"])
-            try:
-                if col_type.lower() == 'two':
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=True,
-                        enable_subtopic_merge=True,
-                        subtopic_threshold=0.2
-                    )
-                else:
-                    extractor = TableExtractor(
-                        skip_header=True,
-                        merge_two_col_rows=False,
-                        enable_subtopic_merge=False,
-                        subtopic_threshold=0.2
-                    )
-                row_boxes = extractor.process_image(temp_path)
-                out_folder = temp_path + "_rows"
-                os.makedirs(out_folder, exist_ok=True)
-                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
-                # List all extracted cell images relative to the output folder.
-                extracted_cells = []
-                for root, dirs, files in os.walk(out_folder):
-                    for file in files:
-                        rel_path = os.path.relpath(os.path.join(root, file), self.output_folder)
-                        extracted_cells.append(rel_path)
-                # Save mapping for testing.
-                self.extracted_tables[image_id] = extracted_cells
                 snippet = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
-                    row_dir = os.path.join(out_folder, f"row_{i}")
                     for j, _ in enumerate(row):
-                        cell_file = f"col_{j}.jpg"
-                        cell_path = os.path.join(row_dir, cell_file)
-                        relp = os.path.relpath(cell_path, self.output_folder)
-                        snippet.append(f"![Row {i} Col {j}]({relp})")
                 new_snip = "\n".join(snippet)
-                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
                 md_content = md_content.replace(old_line, new_snip)
             except Exception as e:
-                logger.error(f"Error processing table image {image_id}: {e}")
             finally:
-                if os.path.exists(temp_path):
-                    os.remove(temp_path)
         return md_content
     def post_process(self, key: str, md_content: str) -> str:
@@ -492,6 +733,15 @@ In that scenario, your output might look like:
     "2.3 A2 Unit 3": [24, 30],
     "2.4 A2 Unit 4": [31, 35]
 }}
 4. Another example might list subtopics:
 3.1 Overarching themes 11
 3.2 A: Proof 12
@@ -585,31 +835,24 @@ Now, extract topics from this text:
         return "\n".join(text_parts)
 class MineruNoTextProcessor:
-    def __init__(self, output_folder: str, gemini_api_key: str = None):
         self.output_folder = output_folder
         os.makedirs(self.output_folder, exist_ok=True)
         self.layout_model = "doclayout_yolo"
         self.formula_enable = True
         self.table_enable = False
         self.language = "en"
-        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
-        # For testing via __main__, force local saving.
-        if __name__ == "__main__":
-            logger.info("Running in test mode: using local image writer.")
-            self.use_s3 = False
-        else:
-            if (os.getenv("S3_ACCESS_KEY") and os.getenv("S3_SECRET_KEY") and
-                os.getenv("S3_BUCKET_NAME") and os.getenv("S3_ENDPOINT")):
-                self.use_s3 = True
-                self.s3_writer = s3Writer(
-                    ak=os.getenv("S3_ACCESS_KEY"),
-                    sk=os.getenv("S3_SECRET_KEY"),
-                    bucket=os.getenv("S3_BUCKET_NAME"),
-                    endpoint_url=os.getenv("S3_ENDPOINT")
-                )
-            else:
-                self.use_s3 = False
     def cleanup_gpu(self):
         try:
@@ -622,8 +865,10 @@ class MineruNoTextProcessor:
     def process(self, pdf_path: str) -> Dict[str, Any]:
         logger.info(f"Processing PDF: {pdf_path}")
         try:
             subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
             logger.info(f"Gemini returned subtopics: {subtopics}")
             if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                 response = requests.get(pdf_path)
                 if response.status_code != 200:
@@ -635,46 +880,54 @@ class MineruNoTextProcessor:
                 with open(pdf_path, "rb") as f:
                     pdf_bytes = f.read()
                 logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             total_pages = doc.page_count
             doc.close()
             final_pages = set()
             if not subtopics:
-                logger.warning("No subtopics found. Processing entire PDF as fallback.")
                 final_pages = set(range(total_pages))
             else:
                 for subname, rng in subtopics.items():
                     if not (isinstance(rng, list) and len(rng) == 2):
-                        logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
                         continue
                     start_p, end_p = rng
                     if start_p > end_p:
-                        logger.warning(f"Skipping subtopic '{subname}' => start > end {rng}")
                         continue
-                    occs = find_all_occurrences(pdf_bytes, subname)
-                    logger.info(f"Occurrences of subtopic '{subname}': {occs}")
-                    doc_start_0 = start_p - 1
-                    chosen_page = None
-                    for p in occs:
-                        if p >= doc_start_0:
-                            chosen_page = p
-                            break
-                    if chosen_page is None:
-                        chosen_page = occs[-1] if occs else 0
-                        logger.warning(f"No suitable occurrence for '{subname}'. Using page {chosen_page}.")
-                    raw_offset = chosen_page - doc_start_0
-                    offset = max(0, raw_offset)
-                    s0 = (start_p - 1) + offset
-                    e0 = (end_p - 1) + offset
-                    s0 = max(0, min(total_pages - 1, s0))
-                    e0 = max(0, min(total_pages - 1, e0))
                     for pp in range(s0, e0 + 1):
                         final_pages.add(pp)
             if not final_pages:
-                logger.warning("No valid pages after offset. Processing entire PDF.")
                 final_pages = set(range(total_pages))
             logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
             subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
@@ -684,49 +937,36 @@ class MineruNoTextProcessor:
                 formula_enable=self.formula_enable,
                 table_enable=self.table_enable
             )
-            logger.info("doc_analyze complete. Extracting images.")
-            key = os.path.splitext(os.path.basename(pdf_path))[0]
-            if self.use_s3:
-                writer = S3ImageWriter(self.s3_writer, f"{key}/", self.gemini_api_key)
-                md_prefix = f"{key}/"
-            else:
-                writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
-                md_prefix = "local-unique-prefix/"
             pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
             md_content = pipe_result.get_markdown(md_prefix)
             final_markdown = writer.post_process(md_prefix, md_content)
-            output_json = {
-                "subtopics": subtopics
-            }
-            if not self.use_s3 and isinstance(writer, LocalImageWriter):
-                local_images = {k: v["relative_path"] for k, v in writer.descriptions.items()}
-                tables_extracted = writer.extracted_tables
-                output_json["local_images"] = local_images
-                output_json["tables_extracted"] = tables_extracted
-            # Save output in JSON format.
-            out_json = json.dumps(output_json, indent=2)
-            # Save JSON locally.
-            out_path = os.path.join(self.output_folder, "final_output.json")
             with open(out_path, "w", encoding="utf-8") as f:
-                f.write(out_json)
-            logger.info(f"Final JSON saved locally at {out_path}")
-            # Also save a local copy for testing.
-            local_md_path = os.path.join(self.output_folder, "final_output_local.json")
-            with open(local_md_path, "w", encoding="utf-8") as f:
-                f.write(out_json)
-            logger.info(f"Final JSON saved locally at {local_md_path}")
-            return output_json
         finally:
             self.cleanup_gpu()
 if __name__ == "__main__":
-    input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
-    output_dir = "/home/user/app/wje"
     gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
-        result_json = processor.process(input_pdf)
         logger.info("Processing completed successfully.")
     except Exception as e:
         logger.error(f"Processing failed: {e}")

 _GEMINI_CLIENT = None
+#helper functions, also global
 def unify_whitespace(text: str) -> str:
     return re.sub(r"\s+", " ", text).strip()
     doc.close()
     return subset_bytes
+def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
+    """
+    Clean up a topic title.
+
+    - Surrounding whitespace and a trailing "continued" (case-insensitive) are removed.
+    - A title that already starts with a digit is returned unchanged after that cleanup.
+    - The known truncated title "gonometry" is repaired to "5 Trigonometry" when the
+      first numbered child belongs to section 5.
+    - Otherwise, if every numbered child ("<n>.<m>...") shares the same leading number,
+      that number is prepended: "<n> <title>", or "<n> Topic" when the title is empty.
+
+    Args:
+        raw_title: Heading text to clean; None is treated as an empty string.
+        children_subtopics: Child dicts; only each child's "title" value is read.
+            None is treated as an empty list.
+
+    Returns:
+        The cleaned title, possibly with a numeric prefix added.
+    """
+    title = (raw_title or "").strip()
+    # Remove trailing "continued"
+    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)
+    # If title already starts with a number, use it as is.
+    if re.match(r"^\d+", title):
+        return title
+    # Otherwise, try to deduce a numeric prefix from the children.
+    prefixes = []
+    for child in children_subtopics or []:
+        m = re.match(r"^(\d+)\.", (child.get("title") or "").strip())
+        if m:
+            prefixes.append(m.group(1))
+    if not prefixes:
+        return title
+    # Known broken titles must be repaired BEFORE the generic prefixing below:
+    # once "5 " has been prepended the title no longer equals "gonometry",
+    # so the repair would never trigger.
+    if title.lower() in {"gonometry"} and prefixes[0] == "5":
+        return "5 Trigonometry"
+    # Only trust the prefix when all numbered children agree on it.
+    if all(p == prefixes[0] for p in prefixes):
+        return f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
+    return title
+def merge_topics(subtopic_list: list) -> list:
+    """
+    Merge per-table topic records into one ordered list of topics.
+
+    Steps:
+    1. Clean each topic's title using unify_topic_name.
+    2. Group topics by the leading number of the cleaned title; topics without a
+       leading number are grouped by the full cleaned title. When two records fall
+       into the same group the longer title wins and their contents/children are
+       concatenated.
+    3. Reassign children: a child titled "<n>.<m>" is moved to the group keyed "<n>"
+       when that group exists and is not the child's current group.
+    4. Merge children that share the same (stripped) title by concatenating their
+       "contents" and "children".
+    5. Sort each group's children by the integers found in their titles
+       (so "2.1" < "2.2" < "2.10") and the groups by their leading number;
+       entries without any number sort last.
+
+    Args:
+        subtopic_list: Topic dicts with optional "title", "contents" and "children"
+            keys; each child is a dict of the same shape.
+
+    Returns:
+        A new list of merged topic dicts. The input records are not modified.
+    """
+    def numeric_key(text):
+        # All integers in the title, compared as a tuple: (2, 1) < (2, 2) < (2, 10).
+        digits = re.findall(r"\d+", text)
+        return tuple(int(d) for d in digits) if digits else (9999,)
+
+    def leading_number(text):
+        m = re.match(r"^(\d+)", text)
+        return int(m.group(1)) if m else 9999
+
+    # First, merge topics by parent's numeric prefix.
+    merged = {}
+    for topic_obj in subtopic_list:
+        children = topic_obj.get("children") or []
+        contents = topic_obj.get("contents") or []
+        new_title = unify_topic_name(topic_obj.get("title", ""), children)
+        m = re.match(r"^(\d+)", new_title)
+        key = m.group(1) if m else new_title
+        entry = merged.get(key)
+        if entry is None:
+            merged[key] = {
+                "title": new_title,
+                "contents": list(contents),
+                "children": list(children),
+            }
+        else:
+            # Merge contents and children; choose the longer title.
+            if len(new_title) > len(entry["title"]):
+                entry["title"] = new_title
+            entry["contents"].extend(contents)
+            entry["children"].extend(children)
+    # Reassign children to the correct parent based on their numeric prefix.
+    for key, topic in merged.items():
+        kept = []
+        for child in topic["children"]:
+            m_child = re.match(r"^(\d+)\.", (child.get("title") or "").strip())
+            if m_child:
+                target = m_child.group(1)
+                if target != key and target in merged:
+                    merged[target]["children"].append(child)
+                    continue
+            kept.append(child)
+        topic["children"] = kept
+    # Remove duplicate children by merging their contents.
+    for topic in merged.values():
+        unique = {}
+        for child in topic["children"]:
+            ctitle = (child.get("title") or "").strip()
+            seen = unique.get(ctitle)
+            if seen is None:
+                # Copy with fresh lists: a child lacking "contents"/"children" must
+                # not raise KeyError later, and the caller's dict must stay untouched.
+                unique[ctitle] = {
+                    **child,
+                    "contents": list(child.get("contents") or []),
+                    "children": list(child.get("children") or []),
+                }
+            else:
+                seen["contents"].extend(child.get("contents") or [])
+                seen["children"].extend(child.get("children") or [])
+        topic["children"] = sorted(
+            unique.values(), key=lambda ch: numeric_key(ch.get("title") or "")
+        )
+    # Convert merged topics to a sorted list.
+    return sorted(merged.values(), key=lambda topic: leading_number(topic["title"]))
 class s3Writer:
     def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
         self.bucket = bucket
         )
     def write(self, path: str, data: bytes) -> None:
         try:
             file_obj = BytesIO(data)
             self.client.upload_fileobj(
             logger.error(f"Failed to upload to S3: {str(e)}")
             raise
+    def delete(self, path: str) -> None:
+        """
+        Delete the object stored under key `path` from this writer's bucket.
+
+        Any exception raised by the client call is logged and then re-raised.
+        """
+        try:
+            self.client.delete_object(Bucket=self.bucket, Key=path)
+        except Exception as e:
+            logger.error(f"Failed to delete from S3: {str(e)}")
+            raise
 def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
     arr = np.frombuffer(image_data, np.uint8)
     img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
     return image_data
 def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
+    """
+    Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
+    """
     for attempt in range(max_retries + 1):
         try:
             prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
+The three-column 'table' image includes such key features:
+    - Three columns header
+    - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
+    - Possibly sections (e.g. 8.4, 9.1)
+The two-column 'table' image includes such key features:
+    - Two columns
+    - Headers like 'Subject content', 'Additional information'
+    - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
 If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
 If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
+If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
 Return only one of these exact labels.
 """
             global _GEMINI_CLIENT
+            if _GEMINI_CLIENT is None:
+                _GEMINI_CLIENT = genai.Client(api_key=api_key)
             client = _GEMINI_CLIENT
             resp = client.models.generate_content(
                 model="gemini-2.0-flash",
                 contents=[
                         ]
                     }
                 ],
+                config=types.GenerateContentConfig(temperature=0.0)
             )
             if resp and resp.text:
                 classification = resp.text.strip().upper()
                     return "THREE_COLUMN"
                 elif "TWO" in classification:
                     return "TWO_COLUMN"
+                elif "EMPTY" in classification:
+                    return "EMPTY_IMAGE"
             return "NO_TABLE"
         except Exception as e:
             logger.error(f"Gemini table classification error: {e}")
     preprocessed = preprocess_image(image_data)
     return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
+def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
+    """
+    Ask Gemini to read a topic title and subtopic numbers from one image.
+
+    The image is sent base64-encoded as "image/jpeg" together with a fixed prompt
+    to the "gemini-2.0-flash" model at temperature 0. The module-level
+    _GEMINI_CLIENT is created on first use and reused afterwards.
+
+    Args:
+        image_data: Raw image bytes.
+        api_key: Gemini API key, used only when the shared client is first created.
+        max_retries: Number of extra attempts after a failed call or an
+            unparsable answer (0.5 s pause between attempts).
+
+    Returns:
+        {"title": str, "subtopics": list of str}. The title is "EMPTY_IMAGE" when
+        the model reports a blank/truncated image. An empty title and list are
+        returned for an empty response or once all attempts have failed.
+    """
+    for attempt in range(max_retries + 1):
+        try:
+            # NOTE: this is a plain string, not an f-string, so JSON braces are
+            # written singly; doubled braces would be sent to the model verbatim.
+            prompt = """
+You are given an image from an educational curriculum specification. The image may contain:
+1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
+2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
+3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
+4) Possibly no relevant text at all.
+Your task is to extract:
+- **"title"**: A recognized main topic or heading text.
+- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.
+Follow these rules:
+(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
+    - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
+    - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.
+(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4", then:
+    - Collect those exact strings in the JSON key "subtopics" (an array of strings).
+    - "title" in this case should be an empty string if you only detect subtopics.
+      (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).
+(3) **If no main topic, subtopic number, heading or label is detected,** return:
+    {
+      "title": "",
+      "subtopics": []
+    }
+(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
+    - Use the **left column text** as "title".
+    - "subtopics" remains empty.
+    Example:
+    If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
+    {
+      "title": "Scarcity, choice and opportunity cost",
+      "subtopics": []
+    }
+(5) **If there is a character + digit pattern** in the left column for a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
+    - Put that label text into "title" (e.g. "G2").
+    - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.
+(6) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
+    {
+      "title": "...",
+      "subtopics": [...]
+    }
+(7) If the image is blank or truncated, defined as:
+    - Contains no words at all (e.g. a blank white or black image)
+    - Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
+    - Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
+    {
+        "title": "EMPTY_IMAGE",
+        "subtopics": []
+    }
+**Examples**:
+- If the image text is `"2 Algebra and functions continued"`, return:
+  {
+    "title": "2 Algebra and functions",
+    "subtopics": []
+  }
+- If the image text is `"2.5 Solve linear and quadratic inequalities ..."`, return:
+  {
+    "title": "",
+    "subtopics": ["2.5"]
+  }
+- If the image text is `"Scarcity, choice and opportunity cost"` (with no numeric patterns at all), return:
+  {
+    "title": "Scarcity, choice and opportunity cost",
+    "subtopics": []
+  }
+- If the left column says `"G2"` and the right column has details, but no subtopic numbers, return:
+  {
+    "title": "G2",
+    "subtopics": []
+  }
+- If you cannot recognize any text matching these patterns, or if nothing is found, return:
+  {
+    "title": "",
+    "subtopics": []
+  }
+"""
+            global _GEMINI_CLIENT
+            if _GEMINI_CLIENT is None:
+                _GEMINI_CLIENT = genai.Client(api_key=api_key)
+            client = _GEMINI_CLIENT
+            resp = client.models.generate_content(
+                model="gemini-2.0-flash",
+                contents=[
+                    {
+                        "parts": [
+                            {"text": prompt},
+                            {
+                                "inline_data": {
+                                    "mime_type": "image/jpeg",
+                                    "data": base64.b64encode(image_data).decode("utf-8")
+                                }
+                            }
+                        ]
+                    }
+                ],
+                config=types.GenerateContentConfig(temperature=0.0)
+            )
+            if not resp or not resp.text:
+                logger.warning("Gemini returned an empty response for subtopic extraction.")
+                return {"title": "", "subtopics": []}
+            raw = resp.text.strip()
+            # Remove any markdown fences if present
+            raw = raw.replace("```json", "").replace("```", "").strip()
+            data = json.loads(raw)
+            # Anything but a JSON object is an unusable answer; raising routes it
+            # through the retry path below with a readable message.
+            if not isinstance(data, dict):
+                raise ValueError(f"expected a JSON object, got {type(data).__name__}")
+            title = data.get("title", "")
+            if not isinstance(title, str):
+                title = ""
+            if title.strip().upper() == "EMPTY_IMAGE":
+                return {"title": "EMPTY_IMAGE", "subtopics": []}
+            subtopics = data.get("subtopics", [])
+            if not isinstance(subtopics, list):
+                subtopics = []
+            # Callers use each entry as a title string; numbers such as 2.5 are
+            # converted, other element types are dropped.
+            subtopics = [str(s) for s in subtopics if isinstance(s, (str, int, float))]
+            return {"title": title, "subtopics": subtopics}
+        except Exception as e:
+            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
+            if attempt < max_retries:
+                time.sleep(0.5)
+            else:
+                return {"title": "", "subtopics": []}
+    return {"title": "", "subtopics": []}
 class S3ImageWriter(DataWriter):
     def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
         self.s3_writer = s3_writer
         self.base_path = base_path if base_path.endswith("/") else base_path + "/"
         self.gemini_api_key = gemini_api_key
         self.descriptions = {}
         self._img_count = 0
+        self.extracted_tables = {}
+        self.extracted_subtopics = {}
     def write(self, path: str, data: bytes) -> None:
         self._img_count += 1
     async def post_process_async(self, key: str, md_content: str) -> str:
+        """
+        Classify every collected image and rewrite the markdown accordingly.
+
+        Each entry of self.descriptions is classified concurrently; a failed
+        classification is logged and treated as "NO_TABLE". Images classified as
+        "EMPTY_IMAGE" are removed from the markdown, from S3 and from
+        self.descriptions. For every other image the empty-alt reference
+        "![](<key><p>)" is replaced by one carrying an alt text and the image's
+        's3_path'. Table images are then handed to
+        _process_table_images_in_markdown, and finally only lines that start
+        with a markdown image reference are kept.
+
+        Args:
+            key: Prefix that precedes each image path in the markdown references.
+            md_content: Markdown text containing the image references.
+
+        Returns:
+            The remaining image-reference lines joined with newlines.
+        """
         logger.info("Classifying images to detect tables.")
+        # One classification task per image, keyed by the image's path.
         tasks = {
             p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
             for p, info in self.descriptions.items()
         }
+        # return_exceptions=True: failures come back as exception objects in the
+        # result list instead of propagating out of gather().
         results = await asyncio.gather(*tasks.values(), return_exceptions=True)
+        # gather() preserves input order and tasks was built by iterating
+        # self.descriptions, so keys and results line up.
         for p, result in zip(list(self.descriptions.keys()), results):
             if isinstance(result, Exception):
                 logger.error(f"Table classification error for {p}: {result}")
                 self.descriptions[p]['table_classification'] = "NO_TABLE"
             else:
                 self.descriptions[p]['table_classification'] = result
         # Process each image description.
+        # Iterate over a snapshot because EMPTY_IMAGE entries are deleted in the loop.
         for p, info in list(self.descriptions.items()):
             cls = info['table_classification']
             if cls == "TWO_COLUMN":
                 info['final_alt'] = "HAS TO BE PROCESSED - two column table"
             elif cls == "THREE_COLUMN":
                 info['final_alt'] = "HAS TO BE PROCESSED - three column table"
             elif cls == "EMPTY_IMAGE":
                 # Remove markdown reference, delete from descriptions and S3.
                 md_content = md_content.replace(f"![]({key}{p})", "")
                 try:
                     self.s3_writer.delete(info['s3_path'])
                 except Exception as e:
                     logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
                 del self.descriptions[p]
                 continue
             else:
                 info['final_alt'] = "NO_TABLE image"
             md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
         md_content = await self._process_table_images_in_markdown(key, md_content)
         # Filter final lines to keep only lines with images.
         final_lines = [
             line.strip() for line in md_content.split("\n")
             if re.match(r"^\!\[.*\]\(.*\)", line.strip())
         ]
         return "\n".join(final_lines)
     async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
         matches = re.findall(pat, md_content, flags=re.IGNORECASE)
         if not matches:
             return md_content
         for (col_type, s3_key) in matches:
             logger.info(f"Processing table image: {s3_key}, columns={col_type}")
             img_data = None
             if img_data is None:
                 logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                 continue
+            # Write temporary file for processing.
             with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                 temp_file.write(img_data)
                 temp_path = temp_file.name
             try:
                 if col_type.lower() == 'two':
                     extractor = TableExtractor(
                         subtopic_threshold=0.2
                     )
                 row_boxes = extractor.process_image(temp_path)
+                # logger.info(f"Extracted {len(row_boxes)} rows from {temp_path}")
+                # for i, row in enumerate(row_boxes):
+                #     logger.info(f"Row {i} has {len(row)} cells")
+                out_folder = temp_path + "_rows"
+                os.makedirs(out_folder, exist_ok=True)
+                # out_folder = os.path.join(os.path.dirname(temp_path), os.path.basename(temp_path) + "_rows")
+                # os.makedirs(out_folder, exist_ok=True)
+                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
+                #just to print structure how cells are saved and named for each table image
+                # logger.info(f"Files in {out_folder}:")
+                # for root, dirs, files in os.walk(out_folder):
+                #     logger.info(f"{root}: {files}")
+                recognized_main_topic = ""
+                main_topic_image_key = None
+                recognized_subtopics = []
+                # Loop over each cell image.
                 for i, row in enumerate(row_boxes):
+                    row_dir = os.path.join(out_folder, f"row_{i}")
                     for j, _ in enumerate(row):
+                        cell_path = os.path.join(row_dir, f"col_{j}.png")
+                        if not os.path.isfile(cell_path):
+                            alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
+                            if os.path.isfile(alternative_path):
+                                cell_path = alternative_path
+                            else:
+                                logger.warning(f"Cell image not found: {cell_path}")
+                                continue
+                        with open(cell_path, "rb") as cf:
+                            cell_image_data = cf.read()
+                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
+                        self.s3_writer.write(cell_key, cell_image_data)
+                        #extract subtopic info from the cell image.
+                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
+                        # Check if the image is empty.
+                        if info.get("title", "").upper() == "EMPTY_IMAGE":
+                            try:
+                                self.s3_writer.delete(cell_key)
+                                logger.info(f"Deleted empty cell image from S3: {cell_key}")
+                            except Exception as e:
+                                logger.error(f"Error deleting empty cell image {cell_key}: {e}")
+                            continue  # Skip processing this cell further
+                        if info["title"] and not recognized_main_topic:
+                            recognized_main_topic = info["title"]
+                            main_topic_image_key = cell_key
+                        for st in info["subtopics"]:
+                            recognized_subtopics.append({
+                                "title": st,
+                                "contents": [{"type": "image", "key": cell_key}],
+                                "children": []
+                            })
+                final_json = {
+                    "title": recognized_main_topic,
+                    "contents": [],
+                    "children": recognized_subtopics
+                }
+                if main_topic_image_key:
+                    final_json["contents"].append({"type": "image", "key": main_topic_image_key})
+                # Save the final JSON.
+                self.extracted_subtopics[s3_key] = final_json
+                # Optionally, create a snippet to replace the markdown line.
                 snippet = ["**Extracted table cells:**"]
                 for i, row in enumerate(row_boxes):
                     for j, _ in enumerate(row):
+                        snippet.append(f"![Row {i} Col {j}]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg)")
                 new_snip = "\n".join(snippet)
+                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                 md_content = md_content.replace(old_line, new_snip)
             except Exception as e:
+                logger.error(f"Error processing table image {s3_key}: {e}")
             finally:
+                os.remove(temp_path)
         return md_content
     def post_process(self, key: str, md_content: str) -> str:
     "2.3 A2 Unit 3": [24, 30],
     "2.4 A2 Unit 4": [31, 35]
 }}
+or
+2.1 AS units 6
+2.2 AS units 23
+In that scenario, your output might look like:
+{{
+    "2.1 AS Unit 1": [6, 2],
+    "2.2 AS Unit 2": [23, 43]
+}}
 4. Another example might list subtopics:
 3.1 Overarching themes 11
 3.2 A: Proof 12
         return "\n".join(text_parts)
 class MineruNoTextProcessor:
+    def __init__(self, output_folder: str, gemini_api_key: str):
         self.output_folder = output_folder
         os.makedirs(self.output_folder, exist_ok=True)
         self.layout_model = "doclayout_yolo"
         self.formula_enable = True
         self.table_enable = False
         self.language = "en"
+        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
         self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
+        self.use_s3 = True
+        self.s3_writer = s3Writer(
+            ak=os.getenv("S3_ACCESS_KEY"),
+            sk=os.getenv("S3_SECRET_KEY"),
+            bucket="quextro-resources",
+            endpoint_url=os.getenv("S3_ENDPOINT")
+        )
     def cleanup_gpu(self):
         try:
     def process(self, pdf_path: str) -> Dict[str, Any]:
         logger.info(f"Processing PDF: {pdf_path}")
         try:
+            # Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
             subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
             logger.info(f"Gemini returned subtopics: {subtopics}")
             if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
                 response = requests.get(pdf_path)
                 if response.status_code != 200:
                 with open(pdf_path, "rb") as f:
                     pdf_bytes = f.read()
                 logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             total_pages = doc.page_count
             doc.close()
+            # Decide which pages to process
             final_pages = set()
             if not subtopics:
+                # fallback
                 final_pages = set(range(total_pages))
             else:
+                offset_candidates = []
+                for subname, rng in subtopics.items():
+                    start_p, _ = rng
+                    occs = find_all_occurrences(pdf_bytes, subname)
+                    for p in occs:
+                        candidate = p - (start_p - 1)
+                        if candidate > 0:
+                            offset_candidates.append(candidate)
+                if offset_candidates:
+                    try:
+                        from statistics import mode
+                        global_offset = mode(offset_candidates)
+                    except:
+                        from statistics import median
+                        global_offset = int(median(offset_candidates))
+                else:
+                    global_offset = 0
+                logger.info(f"Computed global offset: {global_offset}")
                 for subname, rng in subtopics.items():
                     if not (isinstance(rng, list) and len(rng) == 2):
                         continue
                     start_p, end_p = rng
                     if start_p > end_p:
                         continue
+                    s0 = (start_p - 1) + global_offset
+                    e0 = (end_p - 1) + global_offset
                     for pp in range(s0, e0 + 1):
                         final_pages.add(pp)
             if not final_pages:
                 final_pages = set(range(total_pages))
             logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
             subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
+            # 4) Analyze and produce markdown
             dataset = PymuDocDataset(subset_pdf_bytes)
             inference = doc_analyze(
                 dataset,
                 formula_enable=self.formula_enable,
                 table_enable=self.table_enable
             )
+            #S3
+            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
+            md_prefix = "/topic-extraction/"
             pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
             md_content = pipe_result.get_markdown(md_prefix)
             final_markdown = writer.post_process(md_prefix, md_content)
+            subtopic_list = list(writer.extracted_subtopics.values())
+            subtopic_list = merge_topics(subtopic_list)
+            out_path = os.path.join(self.output_folder, "_subtopics.json")
             with open(out_path, "w", encoding="utf-8") as f:
+                json.dump(subtopic_list, f, indent=2)
+            logger.info(f"Final subtopics JSON saved locally at {out_path}")
+            return {
+                "final_markdown": final_markdown,
+                "subtopics_extracted": subtopic_list
+            }
         finally:
             self.cleanup_gpu()
 if __name__ == "__main__":
+    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
+    output_dir = "/home/user/app/pearson_json"
+    # SECURITY: the API key must come from the environment only. A real key was
+    # previously hard-coded here as the fallback value; a key that has been
+    # committed should be treated as leaked and rotated.
+    gemini_key = os.getenv("GEMINI_API_KEY", "")
     try:
         processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
+        result = processor.process(input_pdf)
         logger.info("Processing completed successfully.")
     except Exception as e:
         logger.error(f"Processing failed: {e}")

topic_extraction_ars.log DELETED Viewed

@@ -1,746 +0,0 @@
-2025-03-03 15:45:38,171 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
-2025-03-03 15:45:38,974 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
-2025-03-03 15:45:38,975 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
-2025-03-03 15:45:39,261 [INFO] __main__ - Computed global offset: 4
-2025-03-03 15:45:39,261 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
-2025-03-03 15:46:34,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
-2025-03-03 15:46:36,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
-2025-03-03 15:46:37,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
-2025-03-03 15:46:38,161 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
-2025-03-03 15:46:38,703 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
-2025-03-03 15:46:39,330 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
-2025-03-03 15:46:39,805 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
-2025-03-03 15:46:40,281 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
-2025-03-03 15:46:40,751 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
-2025-03-03 15:46:41,336 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
-2025-03-03 15:46:41,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
-2025-03-03 15:46:42,431 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
-2025-03-03 15:46:42,903 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
-2025-03-03 15:46:43,490 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
-2025-03-03 15:46:43,962 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
-2025-03-03 15:46:44,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
-2025-03-03 15:46:45,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
-2025-03-03 15:46:45,448 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
-2025-03-03 15:46:45,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
-2025-03-03 15:46:46,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
-2025-03-03 15:46:47,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
-2025-03-03 15:46:47,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
-2025-03-03 15:46:48,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
-2025-03-03 15:46:48,593 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
-2025-03-03 15:46:49,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
-2025-03-03 15:46:49,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
-2025-03-03 15:46:50,274 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
-2025-03-03 15:46:50,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
-2025-03-03 15:46:51,327 [INFO] __main__ - Classifying images to detect tables.
-2025-03-03 15:46:55,176 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
-2025-03-03 15:46:58,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
-2025-03-03 15:46:58,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
-2025-03-03 15:46:59,179 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
-2025-03-03 15:46:59,433 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
-2025-03-03 15:46:59,434 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
-2025-03-03 15:47:02,885 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
-2025-03-03 15:47:03,187 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
-2025-03-03 15:47:03,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
-2025-03-03 15:47:03,657 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
-2025-03-03 15:47:03,872 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
-2025-03-03 15:47:03,873 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
-2025-03-03 15:47:07,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
-2025-03-03 15:47:07,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
-2025-03-03 15:47:07,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
-2025-03-03 15:47:07,918 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
-2025-03-03 15:47:11,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
-2025-03-03 15:47:11,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
-2025-03-03 15:47:11,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
-2025-03-03 15:47:12,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
-2025-03-03 15:47:12,138 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
-2025-03-03 15:47:15,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
-2025-03-03 15:47:16,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
-2025-03-03 15:47:16,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
-2025-03-03 15:47:16,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
-2025-03-03 15:47:16,850 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
-2025-03-03 15:47:16,850 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
-2025-03-03 15:47:20,810 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
-2025-03-03 15:47:21,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
-2025-03-03 15:47:21,322 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
-2025-03-03 15:47:21,549 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
-2025-03-03 15:47:21,549 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
-2025-03-03 15:47:25,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
-2025-03-03 15:47:25,405 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
-2025-03-03 15:47:25,599 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
-2025-03-03 15:47:25,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
-2025-03-03 15:47:26,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
-2025-03-03 15:47:26,054 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
-2025-03-03 15:47:29,662 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
-2025-03-03 15:47:29,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
-2025-03-03 15:47:30,160 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
-2025-03-03 15:47:30,354 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
-2025-03-03 15:47:30,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
-2025-03-03 15:47:30,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
-2025-03-03 15:47:31,028 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
-2025-03-03 15:47:31,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
-2025-03-03 15:47:31,461 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
-2025-03-03 15:47:31,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
-2025-03-03 15:47:31,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
-2025-03-03 15:47:32,139 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
-2025-03-03 15:47:32,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
-2025-03-03 15:47:32,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
-2025-03-03 15:47:32,587 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
-2025-03-03 15:47:36,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
-2025-03-03 15:47:36,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
-2025-03-03 15:47:36,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
-2025-03-03 15:47:37,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
-2025-03-03 15:47:37,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
-2025-03-03 15:47:37,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
-2025-03-03 15:47:37,760 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
-2025-03-03 15:47:38,012 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
-2025-03-03 15:47:38,226 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
-2025-03-03 15:47:38,226 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
-2025-03-03 15:47:42,402 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
-2025-03-03 15:47:42,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
-2025-03-03 15:47:42,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
-2025-03-03 15:47:43,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
-2025-03-03 15:47:43,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
-2025-03-03 15:47:43,355 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
-2025-03-03 15:47:48,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
-2025-03-03 15:47:48,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
-2025-03-03 15:47:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
-2025-03-03 15:47:48,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
-2025-03-03 15:47:49,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
-2025-03-03 15:47:49,264 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
-2025-03-03 15:47:49,264 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
-2025-03-03 15:47:53,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
-2025-03-03 15:47:53,598 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
-2025-03-03 15:47:53,819 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
-2025-03-03 15:47:54,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
-2025-03-03 15:47:54,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
-2025-03-03 15:47:54,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
-2025-03-03 15:47:54,474 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
-2025-03-03 15:47:57,779 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
-2025-03-03 15:47:58,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
-2025-03-03 15:47:58,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
-2025-03-03 15:47:58,545 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
-2025-03-03 15:47:58,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
-2025-03-03 15:47:58,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
-2025-03-03 15:47:58,994 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
-2025-03-03 15:48:03,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
-2025-03-03 15:48:04,164 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
-2025-03-03 15:48:04,382 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
-2025-03-03 15:48:04,605 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
-2025-03-03 15:48:04,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
-2025-03-03 15:48:05,032 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
-2025-03-03 15:48:05,247 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
-2025-03-03 15:48:05,493 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
-2025-03-03 15:48:05,710 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
-2025-03-03 15:48:05,711 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
-2025-03-03 15:48:09,411 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
-2025-03-03 15:48:09,698 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
-2025-03-03 15:48:09,923 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
-2025-03-03 15:48:10,113 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
-2025-03-03 15:48:10,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
-2025-03-03 15:48:10,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
-2025-03-03 15:48:10,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
-2025-03-03 15:48:10,800 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
-2025-03-03 15:48:14,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
-2025-03-03 15:48:14,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
-2025-03-03 15:48:15,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
-2025-03-03 15:48:15,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
-2025-03-03 15:48:15,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
-2025-03-03 15:48:15,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
-2025-03-03 15:48:16,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
-2025-03-03 15:48:16,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
-2025-03-03 15:48:17,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
-2025-03-03 15:48:17,176 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
-2025-03-03 15:48:20,954 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
-2025-03-03 15:48:21,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
-2025-03-03 15:48:21,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
-2025-03-03 15:48:21,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
-2025-03-03 15:48:21,832 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
-2025-03-03 15:48:22,056 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
-2025-03-03 15:48:22,261 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
-2025-03-03 15:48:22,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
-2025-03-03 15:48:22,482 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
-2025-03-03 15:48:23,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
-2025-03-03 15:48:23,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
-2025-03-03 15:48:24,035 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
-2025-03-03 15:48:24,219 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
-2025-03-03 15:48:24,219 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
-2025-03-03 15:48:27,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
-2025-03-03 15:48:27,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
-2025-03-03 15:48:27,693 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
-2025-03-03 15:48:27,924 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
-2025-03-03 15:48:28,131 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
-2025-03-03 15:48:28,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
-2025-03-03 15:48:28,338 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
-2025-03-03 15:48:32,733 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
-2025-03-03 15:48:32,995 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
-2025-03-03 15:48:33,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
-2025-03-03 15:48:33,449 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
-2025-03-03 15:48:33,449 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
-2025-03-03 15:48:37,495 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
-2025-03-03 15:48:37,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
-2025-03-03 15:48:38,060 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
-2025-03-03 15:48:38,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
-2025-03-03 15:48:38,267 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
-2025-03-03 15:48:42,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
-2025-03-03 15:48:42,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
-2025-03-03 15:48:43,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
-2025-03-03 15:48:43,280 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
-2025-03-03 15:48:43,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
-2025-03-03 15:48:43,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
-2025-03-03 15:48:43,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
-2025-03-03 15:48:43,918 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
-2025-03-03 15:48:47,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
-2025-03-03 15:48:47,900 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
-2025-03-03 15:48:48,125 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
-2025-03-03 15:48:48,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
-2025-03-03 15:48:48,343 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
-2025-03-03 15:48:52,065 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
-2025-03-03 15:48:52,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
-2025-03-03 15:48:52,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
-2025-03-03 15:48:52,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
-2025-03-03 15:48:53,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
-2025-03-03 15:48:53,066 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
-2025-03-03 15:48:56,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
-2025-03-03 15:48:56,863 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
-2025-03-03 15:48:57,087 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
-2025-03-03 15:48:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
-2025-03-03 15:48:57,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
-2025-03-03 15:48:57,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
-2025-03-03 15:48:57,759 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
-2025-03-03 15:49:01,116 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
-2025-03-03 15:49:01,407 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
-2025-03-03 15:49:01,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
-2025-03-03 15:49:01,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
-2025-03-03 15:49:01,847 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
-2025-03-03 15:49:04,977 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
-2025-03-03 15:49:05,258 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
-2025-03-03 15:49:05,498 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
-2025-03-03 15:49:05,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
-2025-03-03 15:49:05,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
-2025-03-03 15:49:06,162 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
-2025-03-03 15:49:06,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
-2025-03-03 15:49:06,612 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
-2025-03-03 15:49:06,613 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
-2025-03-03 15:49:10,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
-2025-03-03 15:49:10,328 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
-2025-03-03 15:49:10,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
-2025-03-03 15:49:10,777 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
-2025-03-03 15:49:10,780 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
-2025-03-03 15:49:11,098 [INFO] __main__ - GPU memory cleaned up.
-2025-03-03 15:49:11,106 [INFO] __main__ - Processing completed successfully.
-2025-03-03 15:53:27,401 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
-2025-03-03 15:53:28,230 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
-2025-03-03 15:53:28,231 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
-2025-03-03 15:53:28,557 [INFO] __main__ - Computed global offset: 4
-2025-03-03 15:53:28,557 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
-2025-03-03 15:54:23,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
-2025-03-03 15:54:25,210 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
-2025-03-03 15:54:25,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
-2025-03-03 15:54:26,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
-2025-03-03 15:54:26,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
-2025-03-03 15:54:27,347 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
-2025-03-03 15:54:27,803 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
-2025-03-03 15:54:28,391 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
-2025-03-03 15:54:28,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
-2025-03-03 15:54:29,437 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
-2025-03-03 15:54:29,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
-2025-03-03 15:54:30,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
-2025-03-03 15:54:30,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
-2025-03-03 15:54:31,438 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
-2025-03-03 15:54:32,029 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
-2025-03-03 15:54:32,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
-2025-03-03 15:54:33,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
-2025-03-03 15:54:33,444 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
-2025-03-03 15:54:33,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
-2025-03-03 15:54:34,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
-2025-03-03 15:54:35,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
-2025-03-03 15:54:35,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
-2025-03-03 15:54:36,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
-2025-03-03 15:54:36,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
-2025-03-03 15:54:37,089 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
-2025-03-03 15:54:37,502 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
-2025-03-03 15:54:38,008 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
-2025-03-03 15:54:38,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
-2025-03-03 15:54:39,068 [INFO] __main__ - Classifying images to detect tables.
-2025-03-03 15:54:42,753 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
-2025-03-03 15:54:46,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
-2025-03-03 15:54:46,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
-2025-03-03 15:54:46,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
-2025-03-03 15:54:47,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
-2025-03-03 15:54:47,110 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
-2025-03-03 15:54:50,464 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
-2025-03-03 15:54:50,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
-2025-03-03 15:54:50,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
-2025-03-03 15:54:51,228 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
-2025-03-03 15:54:51,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
-2025-03-03 15:54:51,463 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
-2025-03-03 15:54:55,079 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
-2025-03-03 15:54:55,364 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
-2025-03-03 15:54:55,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
-2025-03-03 15:54:55,571 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
-2025-03-03 15:54:58,838 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
-2025-03-03 15:54:59,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
-2025-03-03 15:54:59,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
-2025-03-03 15:54:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
-2025-03-03 15:54:59,578 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
-2025-03-03 15:55:03,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
-2025-03-03 15:55:03,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
-2025-03-03 15:55:03,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
-2025-03-03 15:55:04,202 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
-2025-03-03 15:55:04,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
-2025-03-03 15:55:04,417 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
-2025-03-03 15:55:08,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
-2025-03-03 15:55:08,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
-2025-03-03 15:55:08,629 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
-2025-03-03 15:55:08,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
-2025-03-03 15:55:08,816 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
-2025-03-03 15:55:12,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
-2025-03-03 15:55:12,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
-2025-03-03 15:55:12,867 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
-2025-03-03 15:55:13,114 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
-2025-03-03 15:55:13,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
-2025-03-03 15:55:13,344 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
-2025-03-03 15:55:16,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
-2025-03-03 15:55:17,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
-2025-03-03 15:55:17,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
-2025-03-03 15:55:17,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
-2025-03-03 15:55:18,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
-2025-03-03 15:55:18,320 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
-2025-03-03 15:55:18,619 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
-2025-03-03 15:55:18,911 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
-2025-03-03 15:55:19,208 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
-2025-03-03 15:55:19,491 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
-2025-03-03 15:55:19,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
-2025-03-03 15:55:20,093 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
-2025-03-03 15:55:20,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
-2025-03-03 15:55:20,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
-2025-03-03 15:55:20,690 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
-2025-03-03 15:55:24,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
-2025-03-03 15:55:24,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
-2025-03-03 15:55:25,142 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
-2025-03-03 15:55:25,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
-2025-03-03 15:55:25,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
-2025-03-03 15:55:26,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
-2025-03-03 15:55:26,335 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
-2025-03-03 15:55:26,616 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
-2025-03-03 15:55:26,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
-2025-03-03 15:55:26,909 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
-2025-03-03 15:55:30,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
-2025-03-03 15:55:30,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
-2025-03-03 15:55:30,961 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
-2025-03-03 15:55:31,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
-2025-03-03 15:55:31,547 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
-2025-03-03 15:55:31,549 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
-2025-03-03 15:55:34,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
-2025-03-03 15:55:34,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
-2025-03-03 15:55:35,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
-2025-03-03 15:55:35,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
-2025-03-03 15:55:35,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
-2025-03-03 15:55:36,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
-2025-03-03 15:55:36,137 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
-2025-03-03 15:55:39,497 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
-2025-03-03 15:55:39,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
-2025-03-03 15:55:40,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
-2025-03-03 15:55:40,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
-2025-03-03 15:55:40,666 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
-2025-03-03 15:55:40,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
-2025-03-03 15:55:40,977 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
-2025-03-03 15:55:44,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
-2025-03-03 15:55:44,436 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
-2025-03-03 15:55:44,643 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
-2025-03-03 15:55:44,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
-2025-03-03 15:55:45,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
-2025-03-03 15:55:45,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
-2025-03-03 15:55:45,255 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
-2025-03-03 15:55:49,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
-2025-03-03 15:55:49,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
-2025-03-03 15:55:50,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
-2025-03-03 15:55:50,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
-2025-03-03 15:55:50,647 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
-2025-03-03 15:55:50,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
-2025-03-03 15:55:51,295 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
-2025-03-03 15:55:51,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
-2025-03-03 15:55:51,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
-2025-03-03 15:55:51,856 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
-2025-03-03 15:55:55,882 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
-2025-03-03 15:55:56,182 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
-2025-03-03 15:55:56,463 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
-2025-03-03 15:55:56,727 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
-2025-03-03 15:55:57,005 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
-2025-03-03 15:55:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
-2025-03-03 15:55:57,584 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
-2025-03-03 15:55:57,584 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
-2025-03-03 15:56:01,615 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
-2025-03-03 15:56:01,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
-2025-03-03 15:56:02,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
-2025-03-03 15:56:02,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
-2025-03-03 15:56:02,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
-2025-03-03 15:56:03,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
-2025-03-03 15:56:03,393 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
-2025-03-03 15:56:03,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
-2025-03-03 15:56:04,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
-2025-03-03 15:56:04,667 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
-2025-03-03 15:56:09,007 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
-2025-03-03 15:56:09,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
-2025-03-03 15:56:09,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
-2025-03-03 15:56:09,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
-2025-03-03 15:56:09,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
-2025-03-03 15:56:10,171 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
-2025-03-03 15:56:10,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
-2025-03-03 15:56:10,610 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
-2025-03-03 15:56:10,610 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
-2025-03-03 15:56:11,718 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
-2025-03-03 15:56:11,899 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
-2025-03-03 15:56:12,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
-2025-03-03 15:56:12,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
-2025-03-03 15:56:12,266 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
-2025-03-03 15:56:15,231 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
-2025-03-03 15:56:15,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
-2025-03-03 15:56:15,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
-2025-03-03 15:56:16,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
-2025-03-03 15:56:16,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
-2025-03-03 15:56:16,451 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
-2025-03-03 15:56:16,452 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
-2025-03-03 15:56:20,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
-2025-03-03 15:56:21,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
-2025-03-03 15:56:21,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
-2025-03-03 15:56:21,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
-2025-03-03 15:56:21,742 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
-2025-03-03 15:56:25,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
-2025-03-03 15:56:25,883 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
-2025-03-03 15:56:26,108 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
-2025-03-03 15:56:26,319 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
-2025-03-03 15:56:26,320 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
-2025-03-03 15:56:30,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
-2025-03-03 15:56:31,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
-2025-03-03 15:56:31,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
-2025-03-03 15:56:31,455 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
-2025-03-03 15:56:31,684 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
-2025-03-03 15:56:31,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
-2025-03-03 15:56:32,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
-2025-03-03 15:56:32,136 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
-2025-03-03 15:56:35,410 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
-2025-03-03 15:56:35,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
-2025-03-03 15:56:35,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
-2025-03-03 15:56:36,143 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
-2025-03-03 15:56:36,144 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
-2025-03-03 15:56:39,869 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
-2025-03-03 15:56:40,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
-2025-03-03 15:56:40,387 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
-2025-03-03 15:56:40,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
-2025-03-03 15:56:40,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
-2025-03-03 15:56:40,829 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
-2025-03-03 15:56:44,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
-2025-03-03 15:56:44,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
-2025-03-03 15:56:44,728 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
-2025-03-03 15:56:44,929 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
-2025-03-03 15:56:45,153 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
-2025-03-03 15:56:45,372 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
-2025-03-03 15:56:45,372 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
-2025-03-03 15:56:48,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
-2025-03-03 15:56:48,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
-2025-03-03 15:56:49,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
-2025-03-03 15:56:49,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
-2025-03-03 15:56:49,282 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
-2025-03-03 15:56:52,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
-2025-03-03 15:56:52,664 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
-2025-03-03 15:56:52,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
-2025-03-03 15:56:53,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
-2025-03-03 15:56:53,329 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
-2025-03-03 15:56:53,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
-2025-03-03 15:56:53,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
-2025-03-03 15:56:53,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
-2025-03-03 15:56:53,979 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
-2025-03-03 15:56:57,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
-2025-03-03 15:56:57,690 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
-2025-03-03 15:56:57,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
-2025-03-03 15:56:58,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
-2025-03-03 15:56:58,131 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
-2025-03-03 15:56:58,438 [INFO] __main__ - GPU memory cleaned up.
-2025-03-03 15:56:58,445 [INFO] __main__ - Processing completed successfully.
-2025-03-03 17:28:40,888 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
-2025-03-03 17:28:41,627 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
-2025-03-03 17:28:41,628 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
-2025-03-03 17:28:41,960 [INFO] __main__ - Computed global offset: 4
-2025-03-03 17:28:41,961 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
-2025-03-03 17:29:47,681 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
-2025-03-03 17:29:50,244 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
-2025-03-03 17:29:50,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
-2025-03-03 17:29:51,556 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
-2025-03-03 17:29:52,183 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
-2025-03-03 17:29:52,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
-2025-03-03 17:29:53,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
-2025-03-03 17:29:54,194 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
-2025-03-03 17:29:54,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
-2025-03-03 17:29:55,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
-2025-03-03 17:29:56,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
-2025-03-03 17:29:56,666 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
-2025-03-03 17:29:57,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
-2025-03-03 17:29:57,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
-2025-03-03 17:29:58,524 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
-2025-03-03 17:29:59,210 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
-2025-03-03 17:29:59,902 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
-2025-03-03 17:30:00,309 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
-2025-03-03 17:30:01,021 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
-2025-03-03 17:30:01,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
-2025-03-03 17:30:02,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
-2025-03-03 17:30:03,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
-2025-03-03 17:30:03,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
-2025-03-03 17:30:04,225 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
-2025-03-03 17:30:04,890 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
-2025-03-03 17:30:05,488 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
-2025-03-03 17:30:06,047 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
-2025-03-03 17:30:06,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
-2025-03-03 17:30:07,237 [INFO] __main__ - Classifying images to detect tables.
-2025-03-03 17:30:11,295 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
-2025-03-03 17:30:15,135 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
-2025-03-03 17:30:15,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
-2025-03-03 17:30:15,662 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
-2025-03-03 17:30:15,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
-2025-03-03 17:30:15,898 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
-2025-03-03 17:30:20,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
-2025-03-03 17:30:21,085 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
-2025-03-03 17:30:21,321 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
-2025-03-03 17:30:21,556 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c1.jpg
-2025-03-03 17:30:21,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
-2025-03-03 17:30:22,035 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
-2025-03-03 17:30:22,036 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
-2025-03-03 17:30:27,289 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
-2025-03-03 17:30:27,603 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
-2025-03-03 17:30:27,603 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
-2025-03-03 17:30:33,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
-2025-03-03 17:30:33,573 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
-2025-03-03 17:30:33,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
-2025-03-03 17:30:34,027 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
-2025-03-03 17:30:34,028 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
-2025-03-03 17:30:39,478 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
-2025-03-03 17:30:39,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
-2025-03-03 17:30:39,984 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
-2025-03-03 17:30:40,240 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
-2025-03-03 17:30:40,466 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
-2025-03-03 17:30:40,467 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
-2025-03-03 17:30:44,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
-2025-03-03 17:30:45,224 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
-2025-03-03 17:30:45,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
-2025-03-03 17:30:45,669 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
-2025-03-03 17:30:45,909 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r2_c0.jpg
-2025-03-03 17:30:45,910 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
-2025-03-03 17:30:50,049 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
-2025-03-03 17:30:50,338 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
-2025-03-03 17:30:50,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
-2025-03-03 17:30:50,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c1.jpg
-2025-03-03 17:30:51,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
-2025-03-03 17:30:51,001 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
-2025-03-03 17:30:54,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
-2025-03-03 17:30:55,093 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
-2025-03-03 17:30:55,328 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
-2025-03-03 17:30:55,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
-2025-03-03 17:30:55,777 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
-2025-03-03 17:30:56,026 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
-2025-03-03 17:30:56,240 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
-2025-03-03 17:30:56,240 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
-2025-03-03 17:31:00,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
-2025-03-03 17:31:00,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
-2025-03-03 17:31:00,760 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
-2025-03-03 17:31:04,717 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
-2025-03-03 17:31:04,985 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
-2025-03-03 17:31:05,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
-2025-03-03 17:31:05,455 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
-2025-03-03 17:31:05,683 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
-2025-03-03 17:31:05,684 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
-2025-03-03 17:31:10,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
-2025-03-03 17:31:11,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
-2025-03-03 17:31:11,245 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
-2025-03-03 17:31:11,435 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
-2025-03-03 17:31:11,655 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
-2025-03-03 17:31:11,655 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
-2025-03-03 17:31:15,894 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
-2025-03-03 17:31:16,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
-2025-03-03 17:31:16,433 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
-2025-03-03 17:31:16,670 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
-2025-03-03 17:31:16,928 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
-2025-03-03 17:31:17,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
-2025-03-03 17:31:17,120 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
-2025-03-03 17:31:20,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
-2025-03-03 17:31:21,154 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
-2025-03-03 17:31:21,398 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
-2025-03-03 17:31:21,637 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
-2025-03-03 17:31:21,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
-2025-03-03 17:31:22,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
-2025-03-03 17:31:22,095 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=two
-2025-03-03 17:31:27,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
-2025-03-03 17:31:27,685 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
-2025-03-03 17:31:27,686 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
-2025-03-03 17:31:32,916 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
-2025-03-03 17:31:33,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
-2025-03-03 17:31:33,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
-2025-03-03 17:31:33,672 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
-2025-03-03 17:31:33,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
-2025-03-03 17:31:33,904 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
-2025-03-03 17:31:39,209 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
-2025-03-03 17:31:39,525 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
-2025-03-03 17:31:39,778 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
-2025-03-03 17:31:40,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
-2025-03-03 17:31:40,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c1.jpg
-2025-03-03 17:31:40,479 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
-2025-03-03 17:31:40,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
-2025-03-03 17:31:40,708 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
-2025-03-03 17:31:45,922 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
-2025-03-03 17:31:46,235 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
-2025-03-03 17:31:46,463 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
-2025-03-03 17:31:46,691 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
-2025-03-03 17:31:46,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
-2025-03-03 17:31:47,130 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
-2025-03-03 17:31:47,375 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
-2025-03-03 17:31:47,376 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
-2025-03-03 17:31:49,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
-2025-03-03 17:31:49,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
-2025-03-03 17:31:49,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
-2025-03-03 17:31:49,890 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
-2025-03-03 17:31:49,891 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
-2025-03-03 17:31:53,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
-2025-03-03 17:31:54,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
-2025-03-03 17:31:54,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
-2025-03-03 17:31:54,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
-2025-03-03 17:31:54,793 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
-2025-03-03 17:31:55,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
-2025-03-03 17:31:55,019 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=two
-2025-03-03 17:32:00,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
-2025-03-03 17:32:00,653 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
-2025-03-03 17:32:05,661 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
-2025-03-03 17:32:05,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
-2025-03-03 17:32:06,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
-2025-03-03 17:32:06,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
-2025-03-03 17:32:06,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r2_c0.jpg
-2025-03-03 17:32:06,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r2_c1.jpg
-2025-03-03 17:32:06,941 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
-2025-03-03 17:32:12,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
-2025-03-03 17:32:12,703 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
-2025-03-03 17:32:12,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
-2025-03-03 17:32:12,941 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
-2025-03-03 17:32:17,156 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
-2025-03-03 17:32:17,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
-2025-03-03 17:32:17,698 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
-2025-03-03 17:32:17,937 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
-2025-03-03 17:32:17,938 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
-2025-03-03 17:32:23,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
-2025-03-03 17:32:23,450 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
-2025-03-03 17:32:23,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
-2025-03-03 17:32:23,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
-2025-03-03 17:32:24,135 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
-2025-03-03 17:32:24,136 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
-2025-03-03 17:32:29,269 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
-2025-03-03 17:32:29,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c1.jpg
-2025-03-03 17:32:29,771 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
-2025-03-03 17:32:30,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c1.jpg
-2025-03-03 17:32:30,016 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
-2025-03-03 17:32:34,291 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
-2025-03-03 17:32:34,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
-2025-03-03 17:32:34,811 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
-2025-03-03 17:32:35,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c1.jpg
-2025-03-03 17:32:35,298 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
-2025-03-03 17:32:35,299 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
-2025-03-03 17:32:39,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
-2025-03-03 17:32:39,710 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
-2025-03-03 17:32:39,965 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
-2025-03-03 17:32:40,181 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
-2025-03-03 17:32:40,393 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
-2025-03-03 17:32:40,629 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
-2025-03-03 17:32:40,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
-2025-03-03 17:32:41,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
-2025-03-03 17:32:41,080 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
-2025-03-03 17:32:45,688 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
-2025-03-03 17:32:45,999 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
-2025-03-03 17:32:46,226 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
-2025-03-03 17:32:46,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c1.jpg
-2025-03-03 17:32:46,468 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
-2025-03-03 17:32:46,930 [INFO] __main__ - GPU memory cleaned up.
-2025-03-03 17:32:46,940 [INFO] __main__ - Processing completed successfully.
-2025-03-03 17:42:37,923 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
-2025-03-03 17:42:38,720 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
-2025-03-03 17:42:38,721 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
-2025-03-03 17:42:39,089 [INFO] __main__ - Computed global offset: 4
-2025-03-03 17:42:39,090 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
-2025-03-03 17:43:33,813 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
-2025-03-03 17:43:35,535 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
-2025-03-03 17:43:36,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
-2025-03-03 17:43:36,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
-2025-03-03 17:43:37,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
-2025-03-03 17:43:37,857 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
-2025-03-03 17:43:38,322 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
-2025-03-03 17:43:38,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
-2025-03-03 17:43:39,279 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
-2025-03-03 17:43:39,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
-2025-03-03 17:43:40,400 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
-2025-03-03 17:43:40,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
-2025-03-03 17:43:41,381 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
-2025-03-03 17:43:41,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
-2025-03-03 17:43:42,436 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
-2025-03-03 17:43:42,967 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
-2025-03-03 17:43:43,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
-2025-03-03 17:43:43,822 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
-2025-03-03 17:43:44,428 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
-2025-03-03 17:43:44,963 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
-2025-03-03 17:43:45,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
-2025-03-03 17:43:46,199 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
-2025-03-03 17:43:46,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
-2025-03-03 17:43:47,259 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
-2025-03-03 17:43:47,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
-2025-03-03 17:43:48,235 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
-2025-03-03 17:43:48,656 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
-2025-03-03 17:43:49,290 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
-2025-03-03 17:43:49,683 [INFO] __main__ - Classifying images to detect tables.
-2025-03-03 17:43:53,784 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
-2025-03-03 17:43:56,550 [ERROR] __main__ - Error processing table image /topic-extraction/img_1.jpg: [Errno 2] No such file or directory: '/tmp/tmp63t8um4x.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:43:56,550 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
-2025-03-03 17:43:59,443 [ERROR] __main__ - Error processing table image /topic-extraction/img_2.jpg: [Errno 2] No such file or directory: '/tmp/tmps0rsmzl6.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:43:59,443 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
-2025-03-03 17:44:02,428 [ERROR] __main__ - Error processing table image /topic-extraction/img_3.jpg: [Errno 2] No such file or directory: '/tmp/tmpj4fx8a9s.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:02,429 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
-2025-03-03 17:44:05,216 [ERROR] __main__ - Error processing table image /topic-extraction/img_4.jpg: [Errno 2] No such file or directory: '/tmp/tmpmumoju32.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:05,216 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
-2025-03-03 17:44:08,445 [ERROR] __main__ - Error processing table image /topic-extraction/img_5.jpg: [Errno 2] No such file or directory: '/tmp/tmptekcelbx.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:08,445 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
-2025-03-03 17:44:11,635 [ERROR] __main__ - Error processing table image /topic-extraction/img_6.jpg: [Errno 2] No such file or directory: '/tmp/tmpi4bsuwn6.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:11,635 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
-2025-03-03 17:44:14,589 [ERROR] __main__ - Error processing table image /topic-extraction/img_7.jpg: [Errno 2] No such file or directory: '/tmp/tmpj_8l15kk.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:14,589 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
-2025-03-03 17:44:17,836 [ERROR] __main__ - Error processing table image /topic-extraction/img_8.jpg: [Errno 2] No such file or directory: '/tmp/tmp3_kflaqs.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:17,837 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
-2025-03-03 17:44:21,255 [ERROR] __main__ - Error processing table image /topic-extraction/img_9.jpg: [Errno 2] No such file or directory: '/tmp/tmpwuir45y0.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:21,255 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
-2025-03-03 17:44:24,155 [ERROR] __main__ - Error processing table image /topic-extraction/img_10.jpg: [Errno 2] No such file or directory: '/tmp/tmpu2qia4ih.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:24,155 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
-2025-03-03 17:44:27,346 [ERROR] __main__ - Error processing table image /topic-extraction/img_11.jpg: [Errno 2] No such file or directory: '/tmp/tmp5ucu_tbp.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:27,346 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
-2025-03-03 17:44:30,489 [ERROR] __main__ - Error processing table image /topic-extraction/img_12.jpg: [Errno 2] No such file or directory: '/tmp/tmp_ciyju4y.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:30,489 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
-2025-03-03 17:44:33,140 [ERROR] __main__ - Error processing table image /topic-extraction/img_13.jpg: [Errno 2] No such file or directory: '/tmp/tmp1_mz16x9.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:33,141 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=two
-2025-03-03 17:44:36,423 [ERROR] __main__ - Error processing table image /topic-extraction/img_14.jpg: [Errno 2] No such file or directory: '/tmp/tmp_44dh1m3.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:36,423 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
-2025-03-03 17:44:39,622 [ERROR] __main__ - Error processing table image /topic-extraction/img_15.jpg: [Errno 2] No such file or directory: '/tmp/tmp4e3y3440.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:39,623 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
-2025-03-03 17:44:42,896 [ERROR] __main__ - Error processing table image /topic-extraction/img_16.jpg: [Errno 2] No such file or directory: '/tmp/tmp2njdfsc6.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:42,896 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
-2025-03-03 17:44:46,043 [ERROR] __main__ - Error processing table image /topic-extraction/img_17.jpg: [Errno 2] No such file or directory: '/tmp/tmpwq0nk28o.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:46,044 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
-2025-03-03 17:44:47,088 [ERROR] __main__ - Error processing table image /topic-extraction/img_18.jpg: [Errno 2] No such file or directory: '/tmp/tmpdx8gcoqg.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:47,089 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
-2025-03-03 17:44:49,477 [ERROR] __main__ - Error processing table image /topic-extraction/img_19.jpg: [Errno 2] No such file or directory: '/tmp/tmp72627l8g.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:49,478 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=two
-2025-03-03 17:44:53,018 [ERROR] __main__ - Error processing table image /topic-extraction/img_20.jpg: [Errno 2] No such file or directory: '/tmp/tmpdnic1_0w.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:53,019 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
-2025-03-03 17:44:56,093 [ERROR] __main__ - Error processing table image /topic-extraction/img_21.jpg: [Errno 2] No such file or directory: '/tmp/tmpmhoh8yuy.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:56,093 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
-2025-03-03 17:44:59,613 [ERROR] __main__ - Error processing table image /topic-extraction/img_22.jpg: [Errno 2] No such file or directory: '/tmp/tmp7ted27c7.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:44:59,613 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
-2025-03-03 17:45:02,646 [ERROR] __main__ - Error processing table image /topic-extraction/img_23.jpg: [Errno 2] No such file or directory: '/tmp/tmpbr3_k9_v.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:45:02,646 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
-2025-03-03 17:45:06,144 [ERROR] __main__ - Error processing table image /topic-extraction/img_24.jpg: [Errno 2] No such file or directory: '/tmp/tmpg6iw11r9.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:45:06,145 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
-2025-03-03 17:45:09,409 [ERROR] __main__ - Error processing table image /topic-extraction/img_25.jpg: [Errno 2] No such file or directory: '/tmp/tmp_ntakmkl.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:45:09,410 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
-2025-03-03 17:45:12,057 [ERROR] __main__ - Error processing table image /topic-extraction/img_26.jpg: [Errno 2] No such file or directory: '/tmp/tmp0k8i_n4p.jpg_rows/row_0/col_0.jpg'
-2025-03-03 17:45:12,057 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
-2025-03-03 17:45:14,839 [INFO] __main__ - GPU memory cleaned up.

we/final_subtopics.json DELETED Viewed

@@ -1,1139 +0,0 @@
-[
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "1.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_1.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "2 Algebra and functions",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_2.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_2.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_2.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "2 Algebra and functions continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_3.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_3.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_4.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.7",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_4.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_5.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.8",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_5.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.9",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_5.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "2 Algebra and functions continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_6.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.11",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_6.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "3.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_6.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "3 Coordinate geometry in the (x, y) plane continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "3.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_7.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "3.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_7.jpg_r2_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_8.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "4.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_8.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_8.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_8.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_8.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "gonometry",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "5.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_9.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_9.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_9.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_9.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "5 Trigonometry continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "5.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_10.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.7",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_10.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.8",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_10.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "",
-    "contents": [],
-    "children": [
-      {
-        "title": "6.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "6.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "6.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "6.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "6.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "6.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_11.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "6.7",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_12.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_12.jpg_r2_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_13.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "7.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_13.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_13.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_13.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "7.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_14.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_14.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_14.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_14.jpg_r4_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_14.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "8.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_15.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_15.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_15.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_15.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "8.7",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_16.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.8",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_16.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "9.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_16.jpg_r3_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "9.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_16.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "9.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_16.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "9 Numerical methods",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_17.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "9.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "9.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "10.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r2_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "10.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "10.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "10.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_17.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "10.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_18.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "1.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_19.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "2.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_19.jpg_r2_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_20.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "2 Data presentation and interpretation continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "2.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_21.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "3.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_21.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "3.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_22.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_22.jpg_r2_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "4.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_22.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "4 Statistical distributions continued",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "4.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_23.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_23.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "5.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_24.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "5.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_24.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "",
-    "contents": [],
-    "children": [
-      {
-        "title": "7.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_25.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_25.jpg_r4_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "7.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_25.jpg_r5_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "8 Forces and Newton's laws",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "8.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_26.jpg_r0_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.2",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.3",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_26.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Topics",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
-      }
-    ],
-    "children": [
-      {
-        "title": "8.4",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_27.jpg_r1_c1.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.5",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_27.jpg_r2_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "8.6",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_27.jpg_r3_c0.png"
-          }
-        ],
-        "children": []
-      },
-      {
-        "title": "9.1",
-        "contents": [
-          {
-            "type": "image",
-            "key": "/topic-extraction/cells/img_27.jpg_r4_c1.png"
-          }
-        ],
-        "children": []
-      }
-    ]
-  },
-  {
-    "title": "Reason, interpret and communicate mathematically",
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/cells/img_28.jpg_r1_c0.png"
-      }
-    ],
-    "children": []
-  }
-]

we/we_ars/final_subtopics.json DELETED Viewed

@@ -1,282 +0,0 @@
-[
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_1.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_2.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_3.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_4.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_5.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_6.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_7.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_8.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_9.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_10.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_11.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_12.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_13.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_14.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_15.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_16.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_17.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_18.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_19.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_20.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_21.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_22.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_23.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_24.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_25.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_26.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_27.jpg"
-      }
-    ],
-    "children": []
-  },
-  {
-    "title": null,
-    "contents": [
-      {
-        "type": "image",
-        "key": "/topic-extraction/img_28.jpg"
-      }
-    ],
-    "children": []
-  }
-]