SkyNait committed on
Commit
98af312
·
1 Parent(s): 2c394b4

correct JSON and filtering

Browse files
topic_extr.py DELETED
@@ -1,972 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import re
4
- import gc
5
- import json
6
- import logging
7
- import fitz
8
- import boto3
9
- import base64
10
- import time
11
- import asyncio
12
- import tempfile
13
- import requests
14
- from io import BytesIO
15
- from typing import List, Dict, Any
16
-
17
- import torch
18
- import cv2
19
- import numpy as np
20
-
21
- from google import genai
22
- from google.genai import types
23
-
24
- from magic_pdf.data.dataset import PymuDocDataset
25
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
- from magic_pdf.data.data_reader_writer.base import DataWriter
27
- from table_row_extraction import TableExtractor
28
-
29
# Module-wide logging setup: configure the root logger at INFO level, then
# attach a file handler to this module's logger so its records are also
# written to "topic_extraction.log" with a timestamped format.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler("topic_extraction.log")
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
logger.addHandler(file_handler)

# Gemini client shared by the Gemini helpers in this module; it is created
# lazily on first use and then reused for every later call.
_GEMINI_CLIENT = None
37
-
38
- #helper functions, also global
39
def unify_whitespace(text: str) -> str:
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
41
-
42
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """Return the 0-based indices of the pages whose text contains ``search_text``.

    Both the search text and each page's extracted text are whitespace-normalised
    with ``unify_whitespace`` before the substring test, so line breaks and
    repeated spaces in the PDF do not prevent a match.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        search_text: Text to look for. An empty (or whitespace-only) string
            matches every page, because the empty string is a substring of
            any text.

    Returns:
        Page indices in ascending order.
    """
    st_norm = unify_whitespace(search_text)
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Pages are visited in order, so the result is already sorted.
        return [
            i
            for i in range(doc.page_count)
            if st_norm in unify_whitespace(doc[i].get_text("raw"))
        ]
    finally:
        # Release the document even when text extraction raises.
        doc.close()
53
-
54
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """Build a new PDF containing only the requested pages of the original.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page indices to keep. Duplicates are ignored and
            the pages are emitted in ascending order.

    Returns:
        The bytes of the newly assembled PDF.

    Raises:
        ValueError: If ``page_indices`` is empty or contains an index outside
            the source document.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")

    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Close both documents on every path, including the ValueError above.
            new_doc.close()
    finally:
        doc.close()
69
-
70
def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
    """Clean up a topic title and, where possible, give it a numeric prefix.

    Steps:
      * Strip surrounding whitespace and a trailing "continued".
      * If the title already starts with a number, return it unchanged.
      * Otherwise collect the leading number of every child title of the form
        "<n>.<...>". When all children agree on that number it is prepended
        to the title (or "<n> Topic" is used when the title is empty).
      * The known broken title "gonometry" (presumably a truncated
        "Trigonometry") is repaired to "5 Trigonometry" when the first child
        prefix is "5".

    Args:
        raw_title: Title as extracted; may be empty.
        children_subtopics: Child dicts, each optionally carrying a "title".

    Returns:
        The cleaned title.
    """
    title = re.sub(r"\s+continued\s*$", "", raw_title.strip(), flags=re.IGNORECASE)

    # A title that already carries its number needs no further work.
    if re.match(r"^\d+", title):
        return title

    prefixes = []
    for child in children_subtopics or []:
        m = re.match(r"^(\d+)\.", (child.get("title") or "").strip())
        if m:
            prefixes.append(m.group(1))

    if not prefixes:
        return title

    # Repair the known broken title BEFORE any prefix is prepended; once the
    # title has become "5 gonometry" it can no longer be recognised.
    if title.lower() == "gonometry" and prefixes[0] == "5":
        return "5 Trigonometry"

    # Only trust the children when they all agree on a single number.
    if all(p == prefixes[0] for p in prefixes):
        return f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
    return title
106
-
107
-
108
def merge_topics(subtopic_list: list) -> list:
    """Merge per-table topic fragments into one sorted topic tree.

    Processing steps:
      1. Clean each topic title with ``unify_topic_name``.
      2. Group topics by the leading number of the cleaned title; topics
         without a number are grouped by their full title. When groups merge,
         the longer title wins and contents/children are concatenated.
      3. Move each child titled "<n>.<...>" to the parent whose key is "<n>"
         when that parent exists and differs from the current one.
      4. Merge children that share a title by concatenating their contents
         and children.
      5. Sort children and parents numerically, so "2.1" < "2.2" < "2.10";
         entries without any number sort last.

    The input is not modified: child dicts are shallow-copied (with fresh
    "contents" and "children" lists) before anything is merged into them.

    Args:
        subtopic_list: Topic dicts with optional "title", "contents" and
            "children" keys.

    Returns:
        A list of merged parent topic dicts.
    """

    def numeric_key(title):
        # Every integer in the title, compared as a tuple: "2.10" -> (2, 10).
        digits = re.findall(r"\d+", title)
        return tuple(int(d) for d in digits) if digits else (9999,)

    def parent_key(topic):
        m = re.match(r"^(\d+)", topic.get("title", ""))
        return int(m.group(1)) if m else 9999

    def copy_child(child):
        # Guarantee both list keys exist so duplicates can be merged safely,
        # and keep the caller's dicts/lists untouched.
        clone = dict(child)
        clone["contents"] = list(child.get("contents") or [])
        clone["children"] = list(child.get("children") or [])
        return clone

    # Step 1 + 2: group topics by numeric prefix (or full title).
    merged = {}
    for topic_obj in subtopic_list:
        children = [copy_child(c) for c in topic_obj.get("children") or []]
        contents = topic_obj.get("contents") or []
        new_title = unify_topic_name(topic_obj.get("title", ""), children)
        m = re.match(r"^(\d+)", new_title)
        key = m.group(1) if m else new_title

        entry = merged.get(key)
        if entry is None:
            merged[key] = {
                "title": new_title,
                "contents": list(contents),
                "children": children,
            }
            continue
        if len(new_title) > len(entry["title"]):
            entry["title"] = new_title
        entry["contents"].extend(contents)
        entry["children"].extend(children)

    # Step 3: reassign children to the parent matching their numeric prefix.
    # Moves are collected first so no list is modified while it is iterated.
    moved = {}
    for key, topic in merged.items():
        kept = []
        for child in topic["children"]:
            m_child = re.match(r"^(\d+)\.", (child.get("title") or "").strip())
            target = m_child.group(1) if m_child else None
            if target is not None and target != key and target in merged:
                moved.setdefault(target, []).append(child)
            else:
                kept.append(child)
        topic["children"] = kept
    for target, children in moved.items():
        merged[target]["children"].extend(children)

    # Step 4 + 5: de-duplicate children by title, then sort them.
    for topic in merged.values():
        unique = {}
        for child in topic["children"]:
            ctitle = (child.get("title") or "").strip()
            if ctitle in unique:
                unique[ctitle]["contents"].extend(child["contents"])
                unique[ctitle]["children"].extend(child["children"])
            else:
                unique[ctitle] = child
        topic["children"] = sorted(
            unique.values(), key=lambda ch: numeric_key(ch.get("title") or "")
        )

    return sorted(merged.values(), key=parent_key)
186
-
187
class s3Writer:
    """Small wrapper around a boto3 S3 client that is bound to one bucket."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Create the client from an access key, secret key and endpoint URL."""
        client_options = {
            "aws_access_key_id": ak,
            "aws_secret_access_key": sk,
            "endpoint_url": endpoint_url,
        }
        self.bucket = bucket
        self.client = boto3.client('s3', **client_options)

    def write(self, path: str, data: bytes) -> None:
        """Upload ``data`` under the key ``path``; log and re-raise on failure."""
        try:
            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
            logger.info(f"Uploaded to S3: {path}")
        except Exception as e:
            logger.error(f"Failed to upload to S3: {str(e)}")
            raise

    def delete(self, path: str) -> None:
        """Delete the object stored under ``path``; log and re-raise on failure."""
        target = {"Bucket": self.bucket, "Key": path}
        try:
            self.client.delete_object(**target)
        except Exception as e:
            logger.error(f"Failed to delete from S3: {str(e)}")
            raise
216
-
217
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
    """Downscale an encoded image and re-encode it as JPEG.

    Args:
        image_data: Encoded image bytes (any format ``cv2.imdecode`` accepts).
        max_dim: Upper bound for the longer image side, in pixels. Smaller
            images are not resized.
        quality: JPEG quality passed to ``cv2.imencode``.

    Returns:
        The JPEG bytes, or ``image_data`` unchanged when the input is empty,
        cannot be decoded, or cannot be re-encoded.
    """
    # cv2.imdecode raises on an empty buffer, so short-circuit empty input.
    if not image_data:
        return image_data

    arr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        return image_data

    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / float(longest)
        # Clamp to 1 px: a very elongated image would otherwise round its
        # short side down to 0, which cv2.resize rejects.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    success, enc = cv2.imencode(".jpg", img, encode_params)
    return enc.tobytes() if success else image_data
232
-
233
# Prompt for the table classifier; constant, so it is built once at import.
_TABLE_CLASSIFICATION_PROMPT = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image includes such key features:
- Three columns header
- Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
- Possibly sections (e.g. 8.4, 9.1)
The two-column 'table' image includes such key features:
- Two columns
- Headers like 'Subject content', 'Additional information'
- Possibly sections (e.g. 2.1, 3.4, G2, G3, )
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
Return only one of these exact labels.
"""


def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Ask Gemini whether an image shows a two- or three-column table.

    Args:
        image_data: JPEG-encoded image bytes.
        api_key: Gemini API key. It is only used when the shared client has
            not been created yet; later calls reuse the existing client.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        One of "THREE_COLUMN", "TWO_COLUMN", "EMPTY_IMAGE" or "NO_TABLE".
        "NO_TABLE" is also the answer for an empty response, for an error
        mentioning "503" (not retried), and once all attempts are used up.
    """
    global _GEMINI_CLIENT

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": _TABLE_CLASSIFICATION_PROMPT},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode('utf-8')
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if resp and resp.text:
                classification = resp.text.strip().upper()
                if "THREE" in classification:
                    return "THREE_COLUMN"
                if "TWO" in classification:
                    return "TWO_COLUMN"
                # NOTE(review): the prompt never asks for an "EMPTY" label, so
                # this branch only fires if the model volunteers one.
                if "EMPTY" in classification:
                    return "EMPTY_IMAGE"
            return "NO_TABLE"
        except Exception as e:
            logger.error(f"Gemini table classification error: {e}")
            # A 503 is not retried; neither is the final attempt.
            if "503" in str(e) or attempt >= max_retries:
                return "NO_TABLE"
            time.sleep(0.5)

    # Reached only when no attempt ran (negative max_retries); still return a
    # valid label rather than None.
    return "NO_TABLE"
292
-
293
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Preprocess an image and classify it with Gemini off the event loop.

    Both the image preprocessing and the blocking Gemini call run in the
    default executor, so neither blocks the event loop.

    Args:
        image_data: Encoded image bytes.
        api_key: Gemini API key.
        max_retries: Forwarded to ``call_gemini_for_table_classification``.

    Returns:
        The label returned by ``call_gemini_for_table_classification``.
    """

    def _classify() -> str:
        preprocessed = preprocess_image(image_data)
        return call_gemini_for_table_classification(preprocessed, api_key, max_retries)

    # get_running_loop() is the supported way to reach the loop from a coroutine.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _classify)
297
-
298
# Prompt for the cell-level title/subtopic extractor. This is a plain string
# (not an f-string), so JSON braces are written singly. Rule (3) covers only
# the "nothing recognisable" case so that it does not contradict rule (4).
_SUBTOPIC_IDENTIFICATION_PROMPT = """
You are given an image from an educational curriculum specification. The image may contain:
1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
4) Possibly no relevant text at all.

Your task is to extract:
- **"title"**: A recognized main topic or heading text.
- **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.

Follow these rules:

(1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
- Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
- "subtopics" should be an empty array, unless you also see smaller subtopic numbers.

(2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4", then:
- Collect those exact strings in the JSON key "subtopics" (an array of strings).
- "title" in this case should be an empty string if you only detect subtopics.
(Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).

(3) If no main topic, subtopic, or heading-like text is detected, return:
{
"title": "",
"subtopics": []
}

(4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
- Use the **left column text** as "title".
- "subtopics" remains empty.
Example:
If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
{
"title": "Scarcity, choice and opportunity cost",
"subtopics": []
}

(5) **If there is a character + digit pattern** in the left column for a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
- Put that label text into "title" (e.g. "G2").
- "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.

(6) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
{
"title": "...",
"subtopics": [...]
}

(7) If the image is blank or truncated, defined as:
- Contains no words at all (e.g. a blank white or black image)
- Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
- Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
{
"title": "EMPTY_IMAGE",
"subtopics": []
}

**Examples**:

- If the image text is `"2 Algebra and functions continued"`, return:
{
"title": "2 Algebra and functions",
"subtopics": []
}

- If the image text is `"2.5 Solve linear and quadratic inequalities ..."`, return:
{
"title": "",
"subtopics": ["2.5"]
}

- If the image text is `"Scarcity, choice and opportunity cost"` (with no numeric patterns at all), return:
{
"title": "Scarcity, choice and opportunity cost",
"subtopics": []
}

- If the left column says `"G2"` and the right column has details, but no subtopic numbers, return:
{
"title": "G2",
"subtopics": []
}

- If you cannot recognize any text matching these patterns, or if nothing is found, return:
{
"title": "",
"subtopics": []
}
"""


def _parse_subtopic_response(raw_text: str) -> dict:
    """Turn Gemini's raw reply into a ``{"title", "subtopics"}`` dict.

    Raises a ``ValueError`` (``json.JSONDecodeError`` is one) when the reply
    is not a JSON object, so the caller can retry.
    """
    # Remove any markdown fences if present.
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()
    data = json.loads(cleaned)
    if not isinstance(data, dict):
        raise ValueError("Gemini response is not a JSON object.")

    title = data.get("title", "")
    if not isinstance(title, str):
        # e.g. JSON null; treat it as "no title" instead of failing the call.
        title = ""
    if title.upper() == "EMPTY_IMAGE":
        return {"title": "EMPTY_IMAGE", "subtopics": []}

    subtopics = data.get("subtopics", [])
    if not isinstance(subtopics, list):
        subtopics = []
    return {"title": title, "subtopics": subtopics}


def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """Extract a topic title and subtopic numbers from a table-cell image.

    Args:
        image_data: Encoded image bytes of one table cell (sent to Gemini
            with the "image/jpeg" mime type).
        api_key: Gemini API key. It is only used when the shared client has
            not been created yet; later calls reuse the existing client.
        max_retries: Number of additional attempts after the first failure.

    Returns:
        ``{"title": str, "subtopics": list}``. The title is "EMPTY_IMAGE"
        when Gemini reports a blank or truncated image. An empty title and
        list are returned for an empty response or once all attempts fail.
    """
    global _GEMINI_CLIENT
    empty_result = {"title": "", "subtopics": []}

    for attempt in range(max_retries + 1):
        try:
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": _SUBTOPIC_IDENTIFICATION_PROMPT},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode("utf-8")
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )

            if not resp or not resp.text:
                logger.warning("Gemini returned an empty response for subtopic extraction.")
                return dict(empty_result)

            return _parse_subtopic_response(resp.text)

        except Exception as e:
            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
            if attempt >= max_retries:
                return dict(empty_result)
            time.sleep(0.5)

    return dict(empty_result)
439
-
440
class S3ImageWriter(DataWriter):
    """DataWriter that uploads extracted images to S3 and post-processes tables.

    Every image written is uploaded under ``base_path`` and remembered in
    ``descriptions``. ``post_process`` then classifies the images with Gemini,
    rewrites the markdown image links to their S3 keys, splits table images
    into cells and records the recognised topic structure per table image in
    ``extracted_subtopics``.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        self.s3_writer = s3_writer
        # Keys are built by plain concatenation, so ensure a trailing slash.
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        # original path -> {"data", "s3_path", "table_classification", "final_alt"}
        self.descriptions = {}
        self._img_count = 0
        self.extracted_tables = {}
        # table image S3 key -> {"title", "contents", "children"}
        self.extracted_subtopics = {}

    def write(self, path: str, data: bytes) -> None:
        """Upload ``data`` as ``img_<n>.jpg`` and remember it under ``path``."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify all images, rewrite the markdown and expand table images.

        Args:
            key: Prefix placed before each image path in the markdown links.
            md_content: Markdown whose image links use the ``![](key+path)`` form.

        Returns:
            The processed markdown reduced to its image lines.
        """
        logger.info("Classifying images to detect tables.")
        paths = list(self.descriptions)
        results = await asyncio.gather(
            *(
                classify_image_async(self.descriptions[p]["data"], self.gemini_api_key)
                for p in paths
            ),
            return_exceptions=True,
        )
        for p, result in zip(paths, results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Iterate over a copy: empty images are removed from the dict below.
        for p, info in list(self.descriptions.items()):
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            elif cls == "EMPTY_IMAGE":
                # Remove markdown reference, delete from descriptions and S3.
                md_content = md_content.replace(f"![]({key}{p})", "")
                try:
                    self.s3_writer.delete(info['s3_path'])
                except Exception as e:
                    logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
                del self.descriptions[p]
                continue
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep only the lines that are image references.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Split every flagged table image into cells and record its topics.

        For each ``HAS TO BE PROCESSED`` image link the table is cut into cell
        images, each cell is uploaded and sent to Gemini for title/subtopic
        recognition, the result is stored in ``extracted_subtopics`` and the
        link is replaced by links to the uploaded cells. A failure on one
        table is logged and does not stop the remaining tables.
        """
        import shutil  # local import: only needed for temp-folder cleanup

        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = next(
                (
                    desc.get("data")
                    for desc in self.descriptions.values()
                    if desc.get("s3_path") == s3_key
                ),
                None,
            )
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            # The extractor works on files, so stage the image on disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
                temp_file.write(img_data)
                temp_path = temp_file.name
            out_folder = temp_path + "_rows"

            try:
                two_column = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=two_column,
                    enable_subtopic_merge=two_column,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)

                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                recognized_main_topic = ""
                main_topic_image_key = None
                recognized_subtopics = []
                # (row, col, S3 key) of every cell that stays uploaded.
                kept_cells = []

                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.png")
                        if not os.path.isfile(cell_path):
                            alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
                            if not os.path.isfile(alternative_path):
                                logger.warning(f"Cell image not found: {cell_path}")
                                continue
                            cell_path = alternative_path

                        with open(cell_path, "rb") as cf:
                            cell_image_data = cf.read()

                        cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
                        self.s3_writer.write(cell_key, cell_image_data)

                        # Extract subtopic info from the cell image.
                        info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)

                        # Empty cells are removed from S3 and skipped.
                        if info.get("title", "").upper() == "EMPTY_IMAGE":
                            try:
                                self.s3_writer.delete(cell_key)
                                logger.info(f"Deleted empty cell image from S3: {cell_key}")
                            except Exception as e:
                                logger.error(f"Error deleting empty cell image {cell_key}: {e}")
                            continue

                        kept_cells.append((i, j, cell_key))

                        # The first non-empty title becomes the main topic.
                        if info["title"] and not recognized_main_topic:
                            recognized_main_topic = info["title"]
                            main_topic_image_key = cell_key

                        for st in info["subtopics"]:
                            recognized_subtopics.append({
                                "title": st,
                                "contents": [{"type": "image", "key": cell_key}],
                                "children": []
                            })

                final_json = {
                    "title": recognized_main_topic,
                    "contents": [],
                    "children": recognized_subtopics
                }
                if main_topic_image_key:
                    final_json["contents"].append({"type": "image", "key": main_topic_image_key})

                self.extracted_subtopics[s3_key] = final_json

                # Replace the table link with links to the cells that were
                # actually uploaded, using the exact keys they were stored under.
                snippet = ["**Extracted table cells:**"]
                snippet.extend(
                    f"![Row {i} Col {j}]({cell_key})" for i, j, cell_key in kept_cells
                )
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, new_snip)

            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")
            finally:
                # Remove the staged image and the extracted cell folder.
                try:
                    os.remove(temp_path)
                except OSError as e:
                    logger.warning(f"Could not remove temp file {temp_path}: {e}")
                shutil.rmtree(out_folder, ignore_errors=True)

        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous entry point; runs ``post_process_async`` in a new event loop."""
        return asyncio.run(self.post_process_async(key, md_content))
635
-
636
# Prompt template for table-of-contents extraction. It is filled with
# str.format, so literal JSON braces are doubled and {first_pages_text} is the
# only placeholder.
_TOC_PROMPT_TEMPLATE = """
You have the first pages of a PDF specification, including a table of contents.
Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{
"Subtopic A": [start_page, end_page],
"Subtopic B": [start_page, end_page]
}}
5. If you can't find any subtopics, return an empty JSON.
Important notes:
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
- The final output must be valid JSON only, with no extra text or code blocks.
Examples:
1. Given this table of contents:
1 Introduction – 2
Why choose Edexcel A Level Mathematics? - 2
Supporting you in planning and implementing this qualification - 3
Qualification at a glance - 5
2 Subject content and assessment information – 7
Paper 1 and Paper 2: Pure Mathematics - 11
Paper 3: Statistics and Mechanics - 30
Assessment Objectives - 40
3 Administration and general information – 42
Entries - 42
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
Student recruitment and progression - 45
Appendix 1: Formulae – 49
Appendix 2: Notation – 53
Appendix 3: Use of calculators – 59
Appendix 4: Assessment Objectives – 60
Appendix 5: The context for the development of this qualification – 62
Appendix 6: Transferable skills – 64
Appendix 7: Level 3 Extended Project qualification – 65
Appendix 8: Codes – 67
The correct output should be:
{{
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
"Paper 3: Statistics and Mechanics": [30, 42]
}}
2. Given this table of contents:
Qualification at a glance – 1
Assessment Objectives and weightings - 4
Knowledge, skills and understanding – 5
Theme 1: Introduction to markets and market failure - 5
Theme 2: The UK economy – performance and policies - 11
Theme 3: Business behaviour and the labour market - 21
Theme 4: A global perspective - 29
Assessment – 39
Assessment summary - 39
Assessment objectives - 41
Assessment overview - 42
Breakdown of assessment objectives - 42
Synoptic assessment - 43
Discount code and performance tables - 43
Access arrangements, reasonable adjustments and special consideration - 44
Malpractice - 45
Equality Act 2010 and Pearson equality policy - 45
Synoptic assessment - 46
Awarding and reporting - 47
Other information – 49
Student recruitment -49
Prior learning and other requirements -49
Progression - 49
Appendix 1: Transferable skills – 53
Appendix 2: Level 3 Extended Project qualification – 55
Appendix 3: Quantitative skills – 59
Appendix 4: Codes – 61
Appendix 5: Index – 63
The correct output should be:
{{
"Theme 1: Introduction to markets and market failure": [5, 10],
"Theme 2: The UK economy – performance and policies": [11, 20],
"Theme 3: Business behaviour and the labour market": [21, 28],
"Theme 4: A global perspective": [29, 38]
}}
3. You might also see sections like:
2.1 AS Unit 1 11
2.2 AS Unit 2 18
2.3 A2 Unit 3 24
2.4 A2 Unit 4 31
In that scenario, your output might look like:
{{
"2.1 AS Unit 1": [11, 17],
"2.2 AS Unit 2": [18, 23],
"2.3 A2 Unit 3": [24, 30],
"2.4 A2 Unit 4": [31, 35]
}}
or
2.1 AS units 6
2.2 AS units 23
In that scenario, your output might look like:
{{
"2.1 AS Unit 1": [6, 22],
"2.2 AS Unit 2": [23, 43]
}}

4. Another example might list subtopics:
3.1 Overarching themes 11
3.2 A: Proof 12
3.3 B: Algebra and functions 13
3.4 C: Coordinate geometry in the ( x , y ) plane 14
3.5 D: Sequences and series 15
3.6 E: Trigonometry 16
3.7 F: Exponentials and logarithms 17
3.8 G: Differentiation 18
3.9 H: Integration 19
3.10 I: Numerical methods 20
3.11 J: Vectors 20
3.12 K: Statistical sampling 21
3.13 L: Data presentation and interpretation 21
3.14 M: Probability 22
3.15 N: Statistical distributions 23
3.16 O: Statistical hypothesis testing 23
3.17 P: Quantities and units in mechanics 24
3.18 Q: Kinematics 24
3.19 R: Forces and Newton’s laws 24
3.20 S: Moments 25
3.21 Use of data in statistics 26
Here the correct output might look like:
{{
"A: Proof": [12, 12],
"B: Algebra and functions": [13, 13],
...
}}
Now, extract topics from this text:
{first_pages_text}
"""


class GeminiTopicExtractor:
    """Find the major subtopics of a PDF and their page ranges via Gemini.

    The text of the first ``num_pages`` pages (expected to hold the table of
    contents) is sent to Gemini, which is asked for a JSON mapping of
    subtopic name to ``[start_page, end_page]``.
    """

    # Seconds to wait on the HTTP download before giving up.
    _DOWNLOAD_TIMEOUT_S = 60

    def __init__(self, api_key: str = None, num_pages: int = 14):
        # Fall back to the GEMINI_API_KEY environment variable.
        self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
        self.num_pages = num_pages

    def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
        """Return ``{subtopic name: [start_page, end_page]}`` for ``pdf_path``.

        Args:
            pdf_path: Local file path or http(s) URL of the PDF.

        Returns:
            The page ranges reported by Gemini, or an empty dict when no text
            could be read, the model returned nothing, the reply was not
            valid JSON, or the request failed.
        """
        first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
        if not first_pages_text.strip():
            logger.error("No text from first pages => cannot extract subtopics.")
            return {}

        prompt = _TOC_PROMPT_TEMPLATE.format(first_pages_text=first_pages_text)

        global _GEMINI_CLIENT
        if _GEMINI_CLIENT is None:
            _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
        client = _GEMINI_CLIENT
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not response or not response.text:
                logger.warning("No text from LLM => returning empty subtopics.")
                return {}
            return self._parse_page_ranges(response.text)
        except Exception as e:
            logger.error(f"Gemini subtopic extraction error: {e}")
            return {}

    @staticmethod
    def _parse_page_ranges(raw_text: str) -> Dict[str, List[int]]:
        """Parse Gemini's reply into ``{name: [start, end]}``.

        Markdown code fences are stripped first. If any top-level value is a
        dict, the first such dict is treated as the mapping (the model
        sometimes wraps the answer in an outer object). Only entries whose
        value is a two-element list are kept. Invalid JSON yields ``{}``.
        """
        cleaned = raw_text.strip().replace("```json", "").replace("```", "")
        try:
            data = json.loads(cleaned)
        except Exception as json_err:
            logger.error(f"JSON parsing error: {json_err}")
            return {}

        nested = next((v for v in data.values() if isinstance(v, dict)), None)
        source = nested if nested is not None else data
        return {
            name: rng
            for name, rng in source.items()
            if isinstance(rng, list) and len(rng) == 2
        }

    def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
        """Return the text of the first ``num_pages`` pages, joined by newlines.

        ``pdf_path`` may be a local path or an http(s) URL. Any failure is
        logged and results in the text gathered so far (possibly empty).
        """
        text_parts = []
        try:
            if pdf_path.startswith(("http://", "https://")):
                # A timeout keeps a stalled server from hanging the pipeline.
                response = requests.get(pdf_path, timeout=self._DOWNLOAD_TIMEOUT_S)
                if response.status_code != 200:
                    logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                    return ""
                pdf_bytes = response.content
            else:
                with open(pdf_path, "rb") as f:
                    pdf_bytes = f.read()

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                pages_to_read = min(num_pages, doc.page_count)
                for i in range(pages_to_read):
                    text_parts.append(doc[i].get_text("raw"))
            finally:
                # Close the document even if text extraction fails part-way.
                doc.close()
        except Exception as e:
            logger.error(f"Could not open PDF: {e}")
        return "\n".join(text_parts)
836
-
837
class MineruNoTextProcessor:
    """Run topic extraction on a specification PDF.

    ``process`` asks the Gemini subtopic extractor for the page ranges listed
    in the table of contents, maps those printed page numbers onto 0-based
    PDF page indices, analyses only those pages with ``doc_analyze`` in OCR
    mode, and post-processes the produced images through ``S3ImageWriter``.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        """Create the output folder and set up the extractor and S3 writer.

        Args:
            output_folder: Directory that receives ``_subtopics.json``.
            gemini_api_key: Gemini key; when empty, ``GEMINI_API_KEY`` from
                the environment is used for image post-processing.
        """
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.layout_model = "doclayout_yolo"
        self.formula_enable = True
        self.table_enable = False
        self.language = "en"

        self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
        self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")

        self.use_s3 = True
        # S3 credentials and endpoint are read from the environment.
        self.s3_writer = s3Writer(
            ak=os.getenv("S3_ACCESS_KEY"),
            sk=os.getenv("S3_SECRET_KEY"),
            bucket="quextro-resources",
            endpoint_url=os.getenv("S3_ENDPOINT")
        )

    def cleanup_gpu(self):
        """Run the garbage collector and empty the CUDA cache; errors are only logged."""
        try:
            gc.collect()
            torch.cuda.empty_cache()
            logger.info("GPU memory cleaned up.")
        except Exception as e:
            logger.error(f"Error during GPU cleanup: {e}")

    @staticmethod
    def _load_pdf_bytes(pdf_path: str) -> bytes:
        """Return the PDF bytes from a local path or an http(s) URL.

        Raises:
            Exception: if a download does not return HTTP 200.
        """
        if pdf_path.startswith(("http://", "https://")):
            # A timeout keeps an unresponsive host from hanging the pipeline.
            response = requests.get(pdf_path, timeout=60)
            if response.status_code != 200:
                logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
                raise Exception(f"Failed to download PDF: {pdf_path}")
            pdf_bytes = response.content
            logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
            return pdf_bytes
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()
        logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
        return pdf_bytes

    @staticmethod
    def _is_page_range(rng: Any) -> bool:
        """Return True when ``rng`` is a two-element list of (non-bool) ints."""
        return (
            isinstance(rng, list)
            and len(rng) == 2
            and all(isinstance(v, int) and not isinstance(v, bool) for v in rng)
        )

    @staticmethod
    def _estimate_global_offset(pdf_bytes: bytes, subtopics: Dict[str, Any]) -> int:
        """Estimate the shift between printed page numbers and PDF page indices.

        Every page on which a subtopic title occurs yields one candidate
        ``page_index - (start_page - 1)``; the most common candidate wins.
        Returns 0 when there are no candidates.
        """
        from statistics import StatisticsError, median, mode

        offset_candidates = []
        for subname, rng in subtopics.items():
            if not MineruNoTextProcessor._is_page_range(rng):
                continue
            start_p = rng[0]
            for p in find_all_occurrences(pdf_bytes, subname):
                candidate = p - (start_p - 1)
                # Negative candidates are hits before the section can start
                # (e.g. the contents page). Zero is a legitimate offset and
                # must be kept, otherwise later mentions would outvote it.
                if candidate >= 0:
                    offset_candidates.append(candidate)
        if not offset_candidates:
            return 0
        try:
            return mode(offset_candidates)
        except StatisticsError:
            # Older Python versions raise when there is no unique mode.
            return int(median(offset_candidates))

    @staticmethod
    def _pages_for_ranges(subtopics: Dict[str, Any], global_offset: int, total_pages: int) -> set:
        """Return the 0-based page indices covered by the subtopic ranges.

        Malformed or inverted ranges are skipped, and indices are clamped to
        ``0 .. total_pages - 1`` so the result is always valid for the PDF.
        """
        pages = set()
        for rng in subtopics.values():
            if not MineruNoTextProcessor._is_page_range(rng):
                continue
            start_p, end_p = rng
            if start_p > end_p:
                continue
            first = max((start_p - 1) + global_offset, 0)
            last = min((end_p - 1) + global_offset, total_pages - 1)
            pages.update(range(first, last + 1))
        return pages

    def process(self, pdf_path: str) -> Dict[str, Any]:
        """Process one PDF and return its markdown and extracted subtopics.

        Args:
            pdf_path: Local path or http(s) URL of the PDF.

        Returns:
            ``{"final_markdown": ..., "subtopics_extracted": ...}``. The
            subtopic list is also written to ``_subtopics.json`` in the
            output folder.

        GPU cleanup runs whether or not processing succeeds.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        try:
            # Ask Gemini for the subtopic page ranges from the first pages.
            subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
            logger.info(f"Gemini returned subtopics: {subtopics}")

            pdf_bytes = self._load_pdf_bytes(pdf_path)

            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = doc.page_count
            finally:
                doc.close()

            # Decide which pages to process.
            final_pages = set()
            if subtopics:
                global_offset = self._estimate_global_offset(pdf_bytes, subtopics)
                logger.info(f"Computed global offset: {global_offset}")
                final_pages = self._pages_for_ranges(subtopics, global_offset, total_pages)

            if not final_pages:
                # Fallback: no usable ranges, so process the whole document.
                final_pages = set(range(total_pages))

            logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
            subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))

            # Analyze the selected pages and produce markdown.
            dataset = PymuDocDataset(subset_pdf_bytes)
            inference = doc_analyze(
                dataset,
                ocr=True,
                lang=self.language,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable
            )
            # Images are uploaded to S3 through the writer.
            writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)

            md_prefix = "/topic-extraction/"
            pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
            md_content = pipe_result.get_markdown(md_prefix)
            final_markdown = writer.post_process(md_prefix, md_content)

            subtopic_list = list(writer.extracted_subtopics.values())
            subtopic_list = merge_topics(subtopic_list)

            out_path = os.path.join(self.output_folder, "_subtopics.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(subtopic_list, f, indent=2)
            logger.info(f"Final subtopics JSON saved locally at {out_path}")

            return {
                "final_markdown": final_markdown,
                "subtopics_extracted": subtopic_list
            }
        finally:
            self.cleanup_gpu()
962
-
963
if __name__ == "__main__":
    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
    output_dir = "/home/user/app/pearson_json"
    # SECURITY: the API key must come from the environment only. A literal key
    # was previously committed here as the fallback value; it should be
    # treated as leaked and revoked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.error("GEMINI_API_KEY is not set; aborting.")
        raise SystemExit(1)
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Processing failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
topic_extract_arsenii.py DELETED
@@ -1,860 +0,0 @@
1
- #!/usr/bin/env python3
2
- import os
3
- import re
4
- import gc
5
- import json
6
- import logging
7
- import fitz
8
- import boto3
9
- import base64
10
- import time
11
- import asyncio
12
- import tempfile
13
- import requests
14
- from io import BytesIO
15
- from typing import List, Dict, Any
16
-
17
- import torch
18
- import cv2
19
- import numpy as np
20
-
21
- from google import genai
22
- from google.genai import types
23
-
24
- from magic_pdf.data.dataset import PymuDocDataset
25
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
26
- from magic_pdf.data.data_reader_writer.base import DataWriter
27
- from table_row_extraction import TableExtractor
28
-
29
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Module logger; its records are additionally written to a log file whose
# relative path resolves against the current working directory.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
file_handler = logging.FileHandler("topic_extraction_ars.log")
file_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s"))
logger.addHandler(file_handler)

# Gemini client created on first use and then shared by the functions that
# declare `global _GEMINI_CLIENT`.
_GEMINI_CLIENT = None
37
-
38
def unify_whitespace(text: str) -> str:
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    # Splitting on whitespace drops leading/trailing runs and merges inner ones.
    return " ".join(text.split())
40
-
41
def find_all_occurrences(pdf_bytes: bytes, search_text: str) -> List[int]:
    """Return the 0-based indices of the pages whose text contains ``search_text``.

    Both the needle and each page's text are whitespace-normalised with
    ``unify_whitespace`` before the substring test.

    Args:
        pdf_bytes: Raw bytes of the PDF to search.
        search_text: Text to look for; if it is empty after normalisation
            no page is reported.

    Returns:
        Matching page indices in ascending order.
    """
    st_norm = unify_whitespace(search_text)
    if not st_norm:
        # An empty needle is a substring of every page, which would report
        # the whole document as a match.
        return []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Pages are visited in order, so the result is already sorted.
        return [
            i
            for i in range(doc.page_count)
            if st_norm in unify_whitespace(doc[i].get_text("raw"))
        ]
    finally:
        # Close the document even if text extraction raises.
        doc.close()
52
-
53
def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> bytes:
    """Build a new PDF containing only the requested pages.

    Args:
        original_pdf_bytes: Raw bytes of the source PDF.
        page_indices: 0-based page indices to keep; duplicates are ignored
            and pages are copied in ascending order.

    Returns:
        The bytes of the new PDF.

    Raises:
        ValueError: if ``page_indices`` is empty or contains an index outside
            the source document.
    """
    if not page_indices:
        raise ValueError("No page indices provided for subset creation.")
    doc = fitz.open(stream=original_pdf_bytes, filetype="pdf")
    try:
        new_doc = fitz.open()
        try:
            for p in sorted(set(page_indices)):
                if not 0 <= p < doc.page_count:
                    logger.error(f"Page index {p} out of range (0..{doc.page_count - 1}).")
                    raise ValueError(f"Page index {p} out of range.")
                new_doc.insert_pdf(doc, from_page=p, to_page=p)
            return new_doc.tobytes()
        finally:
            # Both documents are closed on the error path as well.
            new_doc.close()
    finally:
        doc.close()
68
-
69
class s3Writer:
    """Upload byte payloads to a single S3 bucket through a boto3 client."""

    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
        """Remember the bucket name and create the boto3 S3 client."""
        self.bucket = bucket
        connection = {
            "aws_access_key_id": ak,
            "aws_secret_access_key": sk,
            "endpoint_url": endpoint_url,
        }
        self.client = boto3.client('s3', **connection)

    def write(self, path: str, data: bytes) -> None:
        """Upload ``data`` under key ``path``; failures are logged and re-raised."""
        try:
            self.client.upload_fileobj(BytesIO(data), self.bucket, path)
            logger.info(f"Uploaded to S3: {path}")
        except Exception as exc:
            logger.error(f"Failed to upload to S3: {exc!s}")
            raise
91
-
92
def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
    """Downscale and re-encode an image as JPEG before it is sent to Gemini.

    Args:
        image_data: Encoded image bytes.
        max_dim: Largest allowed width/height in pixels; bigger images are
            scaled down proportionally.
        quality: JPEG quality passed to ``cv2.imencode``.

    Returns:
        The re-encoded JPEG bytes, or ``image_data`` unchanged when the input
        is empty, cannot be decoded, or cannot be encoded.
    """
    if not image_data:
        # Nothing to decode; hand the input back untouched.
        return image_data
    arr = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        return image_data
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / float(longest)
        # Keep at least one pixel per side: for very elongated images the
        # short side would otherwise truncate to 0.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
    encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
    success, enc = cv2.imencode(".jpg", img, encode_params)
    return enc.tobytes() if success else image_data
107
-
108
def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """
    Ask Gemini whether an image shows a two-column table, a three-column table, or neither.

    Returns "TWO_COLUMN", "THREE_COLUMN" or "NO_TABLE". Errors are logged; the
    call is retried up to ``max_retries`` times after a 0.5 s pause, except
    when the error text contains "503", which yields "NO_TABLE" immediately.
    """
    global _GEMINI_CLIENT
    instructions = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
The three-column 'table' image includes such key features:
- Three columns header
- Headers like 'Topics', 'Content', 'Guidelines'
- Possibly sections (e.g. 8.4, 9.1)
The two-column 'table' image includes such key features:
- Two columns
- Headers like 'Subject content' and 'Additional information'
- Possibly sections (e.g. 2.1, 3.4)
If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
If the image does not show a table at all, respond with 'NO_TABLE'.
Return only one of these exact labels.
"""
    for attempt in range(max_retries + 1):
        try:
            # The client is created once and shared across calls.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)

            image_part = {
                "inline_data": {
                    "mime_type": "image/jpeg",
                    "data": base64.b64encode(image_data).decode('utf-8'),
                }
            }
            resp = _GEMINI_CLIENT.models.generate_content(
                model="gemini-2.0-flash",
                contents=[{"parts": [{"text": instructions}, image_part]}],
                config=types.GenerateContentConfig(temperature=0.0),
            )
            if not (resp and resp.text):
                return "NO_TABLE"
            label = resp.text.strip().upper()
            # "THREE" is tested first, matching the original precedence.
            if "THREE" in label:
                return "THREE_COLUMN"
            if "TWO" in label:
                return "TWO_COLUMN"
            return "NO_TABLE"
        except Exception as e:
            logger.error(f"Gemini table classification error: {e}")
            if "503" in str(e) or attempt >= max_retries:
                return "NO_TABLE"
            time.sleep(0.5)
165
-
166
async def classify_image_async(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
    """Classify an image as a table type without blocking the event loop on the API call.

    The image is preprocessed in the calling coroutine, then
    ``call_gemini_for_table_classification`` runs in the loop's default
    executor. Returns that function's label.
    """
    # Inside a coroutine the running loop is the one to use;
    # get_event_loop() is discouraged in this context.
    loop = asyncio.get_running_loop()
    preprocessed = preprocess_image(image_data)
    return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
170
-
171
-
172
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """
    Send the image of a table cell to Gemini and extract topic headings from it.

    The model is asked for:
    - a main topic heading in the format "<number> <Topic Name>", e.g. "2 Algebra and functions"
    - subtopic headings in the format "<number>.<number>", e.g. "2.5", "3.4"

    Args:
        image_data: JPEG bytes of the cell image.
        api_key: Gemini API key, used only when the shared client has not
            been created yet.
        max_retries: Number of additional attempts after a failed call or
            an unparsable reply.

    Returns:
        {"title": "<main topic or empty string>", "subtopics": ["2.5", ...]}.
        When every attempt fails, the empty result
        {"title": "", "subtopics": []} is returned and the errors are logged,
        in line with the table-classification helper's fallback.
    """
    global _GEMINI_CLIENT
    # Plain (non-f) string, so the JSON example uses single braces.
    prompt = """
You are given an image of a table cell from an educational curriculum specification.
The text in this cell may contain:
1) A main topic heading in the format "<number> <Topic Name>", for example: "2 Algebra and functions"
2) A subtopic heading in the format "<number>.<number>", for example: "2.5" or "3.4"
Identify if the cell contains exactly one main topic or subtopic.
Return a valid JSON object with the keys "title" and "subtopics" of the form:
{
"title": "2 Algebra and functions",
"subtopics": ["2.5", "2.6"]
}
If you find a main topic (like '2 Algebra and functions'), put it in "title".
If you find subtopic numbers (like '2.5', '3.4'), put them in the "subtopics" array.
"""
    for attempt in range(max(max_retries, 0) + 1):
        try:
            # Re-use the shared client instead of building one per cell.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            resp = _GEMINI_CLIENT.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode("utf-8")
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )
            if not resp or not resp.text:
                logger.warning("No text from Gemini for table cell image.")
                return {"title": "", "subtopics": []}

            # Strip Markdown code fences the model may wrap around the JSON.
            raw = resp.text.strip().replace("```json", "").replace("```", "")
            logger.info(f"== RAW == {raw}")

            data = json.loads(raw)
            if not isinstance(data, dict):
                raise ValueError("Gemini reply is not a JSON object")
            # Missing or null keys fall back to empty values.
            title = data.get("title") or ""
            subtopics = data.get("subtopics") or []
            if isinstance(subtopics, str):
                subtopics = [subtopics]
            return {"title": str(title), "subtopics": [str(s) for s in subtopics]}
        except Exception as e:
            logger.error(f"Gemini subtopic identification error: {e}")
            if attempt < max_retries:
                time.sleep(0.5)
    return {"title": "", "subtopics": []}
227
-
228
-
229
class S3ImageWriter(DataWriter):
    """Image writer that stores images in S3 and mines table images for topics.

    Every image passed to ``write`` is uploaded under ``base_path`` and
    recorded in ``self.descriptions``. ``post_process`` then classifies each
    image with Gemini, rewrites the markdown image links to the S3 keys,
    splits two/three-column table images into cell images, and stores one
    topic dict per table in ``self.extracted_subtopics`` keyed by the table's
    S3 key.
    """

    def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
        self.s3_writer = s3_writer
        # Keys are built by plain concatenation, so enforce a trailing slash.
        self.base_path = base_path if base_path.endswith("/") else base_path + "/"
        self.gemini_api_key = gemini_api_key
        self.descriptions = {}
        self._img_count = 0
        self.extracted_tables = {}
        # Final subtopic JSON per processed table image.
        self.extracted_subtopics = {}

    def write(self, path: str, data: bytes) -> None:
        """Upload ``data`` as ``img_<n>.jpg`` and remember it under ``path``."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        s3_key = f"{self.base_path}{unique_id}"
        self.s3_writer.write(s3_key, data)
        self.descriptions[path] = {
            "data": data,
            "s3_path": s3_key,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify images, rewrite their markdown links, and expand table images.

        Args:
            key: Prefix the markdown uses in front of each image path.
            md_content: Markdown produced by the pipeline.

        Returns:
            Only the lines of the rewritten markdown that are image links.
        """
        logger.info("Classifying images to detect tables.")
        tasks = {
            p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
            for p, info in self.descriptions.items()
        }
        results = await asyncio.gather(*tasks.values(), return_exceptions=True)
        for p, result in zip(tasks.keys(), results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error for {p}: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result

        # Replace the original markdown references with alt text + S3 key.
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")

        md_content = await self._process_table_images_in_markdown(key, md_content)

        # Keep only the lines that are image links.
        final_lines = [
            line.strip() for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
        """Split each flagged table image into cells and collect its topics.

        For every table link in ``md_content`` the table is cut into cell
        images, each cell is uploaded to ``<base_path>cells/`` and sent to
        Gemini, the recognised topic/subtopics are stored in
        ``self.extracted_subtopics``, and the table link is replaced by links
        to the uploaded cells. A failure on one table is logged and leaves
        that table's link unchanged.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content

        for (col_type, s3_key) in matches:
            logger.info(f"Processing table image: {s3_key}, columns={col_type}")
            img_data = next(
                (desc.get("data") for desc in self.descriptions.values() if desc.get("s3_path") == s3_key),
                None,
            )
            if img_data is None:
                logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
                continue

            try:
                recognized_main_topic = None
                recognized_subtopics = []
                snippet = ["**Extracted table cells:**"]
                table_name = os.path.basename(s3_key)

                # The table image and its cell crops live in a temporary
                # directory that is removed as a whole afterwards.
                with tempfile.TemporaryDirectory() as work_dir:
                    temp_path = os.path.join(work_dir, "table.jpg")
                    with open(temp_path, "wb") as temp_file:
                        temp_file.write(img_data)

                    two_columns = col_type.lower() == 'two'
                    extractor = TableExtractor(
                        skip_header=True,
                        merge_two_col_rows=two_columns,
                        enable_subtopic_merge=two_columns,
                        subtopic_threshold=0.2
                    )
                    row_boxes = extractor.process_image(temp_path)

                    out_folder = temp_path + "_rows"
                    os.makedirs(out_folder, exist_ok=True)
                    extractor.save_extracted_cells(temp_path, row_boxes, out_folder)

                    for i, row in enumerate(row_boxes):
                        row_dir = os.path.join(out_folder, f"row_{i}")
                        for j, _ in enumerate(row):
                            cell_path = os.path.join(row_dir, f"col_{j}.jpg")
                            with open(cell_path, "rb") as cf:
                                cell_image_data = cf.read()

                            # Upload the cropped cell and link to that upload.
                            cell_key = f"{self.base_path}cells/{table_name}_r{i}_c{j}.jpg"
                            self.s3_writer.write(cell_key, cell_image_data)
                            snippet.append(f"![Row {i} Col {j}]({cell_key})")

                            # e.g. info = {"title": "2 Algebra and functions", "subtopics": ["2.5"]}
                            info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
                            logger.debug(f"== INFO == {info}")

                            # The last non-empty title wins; subtopics accumulate.
                            if info.get("title"):
                                recognized_main_topic = info["title"]
                            if info.get("subtopics"):
                                recognized_subtopics.extend(info["subtopics"])

                final_json = {
                    "title": recognized_main_topic,
                    "contents": [
                        {
                            "type": "image",
                            "key": s3_key
                        }
                    ],
                    "children": []
                }

                for st in recognized_subtopics:
                    final_json["children"].append({
                        "title": st,
                        "contents": [
                            {"type": "image", "key": f"subtopic_{st}_example.jpg"}
                        ]
                    })

                self.extracted_subtopics[s3_key] = final_json

                # Replace the table image line in the markdown with the cell links.
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
                md_content = md_content.replace(old_line, "\n".join(snippet))

            except Exception as e:
                logger.error(f"Error processing table image {s3_key}: {e}")

        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` with ``asyncio.run``."""
        return asyncio.run(self.post_process_async(key, md_content))
413
-
414
-
415
class LocalImageWriter(DataWriter):
    """Image writer that keeps images on local disk (used for testing).

    Every image passed to ``write`` is saved in ``output_folder`` and
    recorded in ``self.descriptions``. ``post_process`` classifies each image
    with Gemini, rewrites the markdown links to the local file names, and
    replaces table images with links to their extracted cell images.
    ``self.extracted_tables`` maps each table image name to its cell files.
    """

    def __init__(self, output_folder: str, gemini_api_key: str):
        self.output_folder = output_folder
        os.makedirs(self.output_folder, exist_ok=True)
        self.descriptions = {}
        self._img_count = 0
        self.gemini_api_key = gemini_api_key

        self.extracted_tables = {}

    def write(self, path: str, data: bytes) -> None:
        """Save ``data`` as ``img_<n>.jpg`` in the output folder and record it under ``path``."""
        self._img_count += 1
        unique_id = f"img_{self._img_count}.jpg"
        self.descriptions[path] = {
            "data": data,
            "relative_path": unique_id,
            "table_classification": "NO_TABLE",
            "final_alt": ""
        }
        # Also save the original image locally for testing.
        image_path = os.path.join(self.output_folder, unique_id)
        with open(image_path, "wb") as f:
            f.write(data)

    async def post_process_async(self, key: str, md_content: str) -> str:
        """Classify images, rewrite their markdown links, and expand table images.

        Returns only the lines of the rewritten markdown that are image links.
        """
        logger.info("Classifying images to detect tables.")
        paths = list(self.descriptions)
        # Run all classifications concurrently, as S3ImageWriter does,
        # instead of awaiting them one after another.
        results = await asyncio.gather(
            *(classify_image_async(self.descriptions[p]["data"], self.gemini_api_key) for p in paths),
            return_exceptions=True,
        )
        for p, result in zip(paths, results):
            if isinstance(result, Exception):
                logger.error(f"Table classification error: {result}")
                self.descriptions[p]['table_classification'] = "NO_TABLE"
            else:
                self.descriptions[p]['table_classification'] = result
        for p, info in self.descriptions.items():
            cls = info['table_classification']
            if cls == "TWO_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - two column table"
            elif cls == "THREE_COLUMN":
                info['final_alt'] = "HAS TO BE PROCESSED - three column table"
            else:
                info['final_alt'] = "NO_TABLE image"
            md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")
        md_content = self._process_table_images_in_markdown(md_content)
        final_lines = [
            line.strip()
            for line in md_content.split("\n")
            if re.match(r"^\!\[.*\]\(.*\)", line.strip())
        ]
        return "\n".join(final_lines)

    def _process_table_images_in_markdown(self, md_content: str) -> str:
        """Replace each flagged table image link with links to its extracted cells.

        A failure on one table is logged and leaves that table's link
        unchanged.
        """
        pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
        matches = re.findall(pat, md_content, flags=re.IGNORECASE)
        if not matches:
            return md_content
        for (col_type, image_id) in matches:
            logger.info(f"Processing table image => {image_id}, columns={col_type}")
            temp_path = os.path.join(self.output_folder, image_id)
            desc_item = None
            for val in self.descriptions.values():
                if val["relative_path"] == image_id:
                    desc_item = val
                    break
            if not desc_item:
                logger.warning(f"No matching image data for {image_id}, skipping extraction.")
                continue
            # Only a file created here is temporary. The copy saved by
            # write() is kept, so a failed extraction does not leave the
            # markdown pointing at a deleted image.
            created_here = not os.path.exists(temp_path)
            if created_here:
                with open(temp_path, "wb") as f:
                    f.write(desc_item["data"])
            try:
                # See the table_row_extraction script for the extractor options.
                two_columns = col_type.lower() == 'two'
                extractor = TableExtractor(
                    skip_header=True,
                    merge_two_col_rows=two_columns,
                    enable_subtopic_merge=two_columns,
                    subtopic_threshold=0.2
                )
                row_boxes = extractor.process_image(temp_path)
                out_folder = temp_path + "_rows"
                os.makedirs(out_folder, exist_ok=True)
                extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
                # List all extracted cell images relative to the output folder.
                extracted_cells = []
                for root, _dirs, files in os.walk(out_folder):
                    for file in files:
                        rel_path = os.path.relpath(os.path.join(root, file), self.output_folder)
                        extracted_cells.append(rel_path)
                # Save mapping for testing.
                self.extracted_tables[image_id] = extracted_cells
                snippet = ["**Extracted table cells:**"]
                for i, row in enumerate(row_boxes):
                    row_dir = os.path.join(out_folder, f"row_{i}")
                    for j, _ in enumerate(row):
                        cell_path = os.path.join(row_dir, f"col_{j}.jpg")
                        relp = os.path.relpath(cell_path, self.output_folder)
                        snippet.append(f"![Row {i} Col {j}]({relp})")
                new_snip = "\n".join(snippet)
                old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
                md_content = md_content.replace(old_line, new_snip)
            except Exception as e:
                logger.error(f"Error processing table image {image_id}: {e}")
            finally:
                if created_here and os.path.exists(temp_path):
                    os.remove(temp_path)
        return md_content

    def post_process(self, key: str, md_content: str) -> str:
        """Synchronous wrapper that runs ``post_process_async`` with ``asyncio.run``."""
        return asyncio.run(self.post_process_async(key, md_content))
533
-
534
- class GeminiTopicExtractor:
535
- def __init__(self, api_key: str = None, num_pages: int = 14):
536
- self.api_key = api_key or os.getenv("GEMINI_API_KEY", "")
537
- self.num_pages = num_pages
538
-
539
def extract_subtopics(self, pdf_path: str) -> Dict[str, List[int]]:
    """Ask Gemini for the subtopic page ranges listed in the PDF's contents.

    The text of the first ``self.num_pages`` pages is embedded in a prompt
    and the model's JSON reply is parsed.

    Args:
        pdf_path: Path or URL handed to ``_read_first_pages_raw``.

    Returns:
        A mapping ``{subtopic name: [start_page, end_page]}`` using the
        1-based page numbers requested in the prompt. Entries that are not
        a two-element list of integers are dropped. An empty dict is
        returned when no text could be read, the model returned nothing,
        or the reply could not be parsed.
    """
    global _GEMINI_CLIENT
    first_pages_text = self._read_first_pages_raw(pdf_path, self.num_pages)
    if not first_pages_text.strip():
        logger.error("No text from first pages => cannot extract subtopics.")
        return {}
    prompt = f"""
You have the first pages of a PDF specification, including a table of contents.
Instructions:
1. Identify the 'Contents' section listing all topics, subtopics, and their corresponding pages.
2. Identify the major academic subtopics (common desired topic names "Paper X", "Theme X", "Content of X", "AS Unit X", "A2 Unit X", or similar headings).
3. For each subtopic, give the range of pages [start_page, end_page] (1-based) from the table of contents.
4. Output only valid JSON of the form:
{{
"Subtopic A": [start_page, end_page],
"Subtopic B": [start_page, end_page]
}}
5. If you can't find any subtopics, return an empty JSON.
Important notes:
- The correct "end_page" must be the page number of the next topic or subtopic minus 1.
- The final output must be valid JSON only, with no extra text or code blocks.
Examples:
1. Given this table of contents:
1 Introduction – 2
Why choose Edexcel A Level Mathematics? - 2
Supporting you in planning and implementing this qualification - 3
Qualification at a glance - 5
2 Subject content and assessment information – 7
Paper 1 and Paper 2: Pure Mathematics - 11
Paper 3: Statistics and Mechanics - 30
Assessment Objectives - 40
3 Administration and general information – 42
Entries - 42
Access arrangements, reasonable adjustments, special consideration and malpractice - 42
Student recruitment and progression - 45
Appendix 1: Formulae – 49
Appendix 2: Notation – 53
Appendix 3: Use of calculators – 59
Appendix 4: Assessment Objectives – 60
Appendix 5: The context for the development of this qualification – 62
Appendix 6: Transferable skills – 64
Appendix 7: Level 3 Extended Project qualification – 65
Appendix 8: Codes – 67
The correct output should be:
{{
"Paper 1 and Paper 2: Pure Mathematics": [11, 29],
"Paper 3: Statistics and Mechanics": [30, 42]
}}
2. Given this table of contents:
Qualification at a glance – 1
Assessment Objectives and weightings - 4
Knowledge, skills and understanding – 5
Theme 1: Introduction to markets and market failure - 5
Theme 2: The UK economy – performance and policies - 11
Theme 3: Business behaviour and the labour market - 21
Theme 4: A global perspective - 29
Assessment – 39
Assessment summary - 39
Assessment objectives - 41
Assessment overview - 42
Breakdown of assessment objectives - 42
Synoptic assessment - 43
Discount code and performance tables - 43
Access arrangements, reasonable adjustments and special consideration - 44
Malpractice - 45
Equality Act 2010 and Pearson equality policy - 45
Synoptic assessment - 46
Awarding and reporting - 47
Other information – 49
Student recruitment -49
Prior learning and other requirements -49
Progression - 49
Appendix 1: Transferable skills – 53
Appendix 2: Level 3 Extended Project qualification – 55
Appendix 3: Quantitative skills – 59
Appendix 4: Codes – 61
Appendix 5: Index – 63
The correct output should be:
{{
"Theme 1: Introduction to markets and market failure": [5, 10],
"Theme 2: The UK economy – performance and policies": [11, 20],
"Theme 3: Business behaviour and the labour market": [21, 28],
"Theme 4: A global perspective": [29, 38]
}}
3. You might also see sections like:
2.1 AS Unit 1 11
2.2 AS Unit 2 18
2.3 A2 Unit 3 24
2.4 A2 Unit 4 31
In that scenario, your output might look like:
{{
"2.1 AS Unit 1": [11, 17],
"2.2 AS Unit 2": [18, 23],
"2.3 A2 Unit 3": [24, 30],
"2.4 A2 Unit 4": [31, 35]
}}
4. Another example might list subtopics:
3.1 Overarching themes 11
3.2 A: Proof 12
3.3 B: Algebra and functions 13
3.4 C: Coordinate geometry in the ( x , y ) plane 14
3.5 D: Sequences and series 15
3.6 E: Trigonometry 16
3.7 F: Exponentials and logarithms 17
3.8 G: Differentiation 18
3.9 H: Integration 19
3.10 I: Numerical methods 20
3.11 J: Vectors 20
3.12 K: Statistical sampling 21
3.13 L: Data presentation and interpretation 21
3.14 M: Probability 22
3.15 N: Statistical distributions 23
3.16 O: Statistical hypothesis testing 23
3.17 P: Quantities and units in mechanics 24
3.18 Q: Kinematics 24
3.19 R: Forces and Newton’s laws 24
3.20 S: Moments 25
3.21 Use of data in statistics 26
Here the correct output might look like:
{{
"A: Proof": [12, 12],
"B: Algebra and functions": [13, 13],
...
}}
Now, extract topics from this text:
{first_pages_text}
"""
    # The client is created once and shared across calls.
    if _GEMINI_CLIENT is None:
        _GEMINI_CLIENT = genai.Client(api_key=self.api_key)
    client = _GEMINI_CLIENT
    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[prompt],
            config=types.GenerateContentConfig(temperature=0.0)
        )
        if not response or not response.text:
            logger.warning("No text from LLM => returning empty subtopics.")
            return {}
        # Strip Markdown code fences the model may add despite the prompt.
        raw_json = response.text.strip()
        cleaned = raw_json.replace("```json", "").replace("```", "")
        try:
            data = json.loads(cleaned)
        except Exception as json_err:
            logger.error(f"JSON parsing error: {json_err}")
            return {}
        if not isinstance(data, dict):
            logger.error("Gemini subtopic extraction error: reply is not a JSON object")
            return {}
        # If the model nested the mapping under a wrapper key, use the first
        # nested object; otherwise the top-level object is the mapping.
        ranges = next((v for v in data.values() if isinstance(v, dict)), data)
        final_dict = {}
        for subk, rng in ranges.items():
            # Later page arithmetic needs two real integers; bool is
            # excluded because it is an int subclass.
            is_int_pair = (
                isinstance(rng, list)
                and len(rng) == 2
                and all(isinstance(v, int) and not isinstance(v, bool) for v in rng)
            )
            if is_int_pair:
                final_dict[subk] = rng
        return final_dict
    except Exception as e:
        logger.error(f"Gemini subtopic extraction error: {e}")
        return {}
703
-
704
- def _read_first_pages_raw(self, pdf_path: str, num_pages: int) -> str:
705
- text_parts = []
706
- try:
707
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
708
- response = requests.get(pdf_path)
709
- if response.status_code != 200:
710
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
711
- return ""
712
- pdf_bytes = response.content
713
- else:
714
- with open(pdf_path, "rb") as f:
715
- pdf_bytes = f.read()
716
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
717
- pages_to_read = min(num_pages, doc.page_count)
718
- for i in range(pages_to_read):
719
- raw_text = doc[i].get_text("raw")
720
- text_parts.append(raw_text)
721
- doc.close()
722
- except Exception as e:
723
- logger.error(f"Could not open PDF: {e}")
724
- return "\n".join(text_parts)
725
-
726
-
727
- class MineruNoTextProcessor:
728
- def __init__(self, output_folder: str, gemini_api_key: str):
729
- self.output_folder = output_folder
730
- os.makedirs(self.output_folder, exist_ok=True)
731
- self.layout_model = "doclayout_yolo"
732
- self.formula_enable = True
733
- self.table_enable = False
734
- self.language = "en"
735
-
736
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
737
- self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
738
-
739
- self.use_s3 = True
740
- self.s3_writer = s3Writer(
741
- ak=os.getenv("S3_ACCESS_KEY"),
742
- sk=os.getenv("S3_SECRET_KEY"),
743
- bucket="quextro-resources",
744
- endpoint_url=os.getenv("S3_ENDPOINT")
745
- )
746
-
747
- def cleanup_gpu(self):
748
- try:
749
- gc.collect()
750
- torch.cuda.empty_cache()
751
- logger.info("GPU memory cleaned up.")
752
- except Exception as e:
753
- logger.error(f"Error during GPU cleanup: {e}")
754
-
755
- def process(self, pdf_path: str) -> Dict[str, Any]:
756
- logger.info(f"Processing PDF: {pdf_path}")
757
- try:
758
- # 1) Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
759
- subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
760
- logger.info(f"Gemini returned subtopics: {subtopics}")
761
-
762
- if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
763
- response = requests.get(pdf_path)
764
- if response.status_code != 200:
765
- logger.error("Failed to download PDF from %s. Status code: %d", pdf_path, response.status_code)
766
- raise Exception(f"Failed to download PDF: {pdf_path}")
767
- pdf_bytes = response.content
768
- logger.info("Downloaded %d bytes for pdf_url='%s'", len(pdf_bytes), pdf_path)
769
- else:
770
- with open(pdf_path, "rb") as f:
771
- pdf_bytes = f.read()
772
- logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
773
-
774
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
775
- total_pages = doc.page_count
776
- doc.close()
777
-
778
- # 3) Decide which pages to process
779
- final_pages = set()
780
- if not subtopics:
781
- # fallback
782
- final_pages = set(range(total_pages))
783
- else:
784
- offset_candidates = []
785
- for subname, rng in subtopics.items():
786
- start_p, _ = rng
787
- occs = find_all_occurrences(pdf_bytes, subname)
788
- for p in occs:
789
- candidate = p - (start_p - 1)
790
- if candidate > 0:
791
- offset_candidates.append(candidate)
792
- if offset_candidates:
793
- try:
794
- from statistics import mode
795
- global_offset = mode(offset_candidates)
796
- except:
797
- from statistics import median
798
- global_offset = int(median(offset_candidates))
799
- else:
800
- global_offset = 0
801
-
802
- logger.info(f"Computed global offset: {global_offset}")
803
- for subname, rng in subtopics.items():
804
- if not (isinstance(rng, list) and len(rng) == 2):
805
- continue
806
- start_p, end_p = rng
807
- if start_p > end_p:
808
- continue
809
- s0 = (start_p - 1) + global_offset
810
- e0 = (end_p - 1) + global_offset
811
- for pp in range(s0, e0 + 1):
812
- final_pages.add(pp)
813
-
814
- if not final_pages:
815
- final_pages = set(range(total_pages))
816
-
817
- logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
818
- subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
819
-
820
- # 4) Analyze and produce markdown
821
- dataset = PymuDocDataset(subset_pdf_bytes)
822
- inference = doc_analyze(
823
- dataset,
824
- ocr=True,
825
- lang=self.language,
826
- layout_model=self.layout_model,
827
- formula_enable=self.formula_enable,
828
- table_enable=self.table_enable
829
- )
830
- writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
831
- md_prefix = "/topic-extraction/"
832
- pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
833
- md_content = pipe_result.get_markdown(md_prefix)
834
- final_markdown = writer.post_process(md_prefix, md_content)
835
-
836
- subtopic_list = list(writer.extracted_subtopics.values())
837
-
838
- out_path = os.path.join(self.output_folder, "final_subtopics.json")
839
- with open(out_path, "w", encoding="utf-8") as f:
840
- json.dump(subtopic_list, f, indent=2)
841
- logger.info(f"Final subtopics JSON saved locally at {out_path}")
842
-
843
- return {
844
- "final_markdown": final_markdown,
845
- "subtopics_extracted": subtopic_list
846
- }
847
- finally:
848
- self.cleanup_gpu()
849
-
850
- if __name__ == "__main__":
851
- input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
852
- output_dir = "/home/user/app/we/we_ars"
853
- gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
854
- try:
855
- processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
856
- result = processor.process(input_pdf)
857
- logger.info("Processing completed successfully.")
858
- # The result includes final_markdown and subtopics_extracted
859
- except Exception as e:
860
- logger.error(f"Processing failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
topic_extraction.py CHANGED
@@ -35,6 +35,7 @@ logger.addHandler(file_handler)
35
 
36
  _GEMINI_CLIENT = None
37
 
 
38
  def unify_whitespace(text: str) -> str:
39
  return re.sub(r"\s+", " ", text).strip()
40
 
@@ -66,6 +67,123 @@ def create_subset_pdf(original_pdf_bytes: bytes, page_indices: List[int]) -> byt
66
  doc.close()
67
  return subset_bytes
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  class s3Writer:
70
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
71
  self.bucket = bucket
@@ -77,7 +195,6 @@ class s3Writer:
77
  )
78
 
79
  def write(self, path: str, data: bytes) -> None:
80
- """Upload data to S3 using proper keyword arguments"""
81
  try:
82
  file_obj = BytesIO(data)
83
  self.client.upload_fileobj(
@@ -90,6 +207,13 @@ class s3Writer:
90
  logger.error(f"Failed to upload to S3: {str(e)}")
91
  raise
92
 
 
 
 
 
 
 
 
93
  def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
94
  arr = np.frombuffer(image_data, np.uint8)
95
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
@@ -107,27 +231,30 @@ def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -
107
  return image_data
108
 
109
  def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
 
 
 
110
  for attempt in range(max_retries + 1):
111
  try:
112
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
113
- The three-column 'table' image include such key features:
114
- - Three columns header columns
115
- - Headers like 'Topics', 'Content', 'Guidelines'
116
- - Numbered sections (e.g., 8.4, 9.1)
117
- - Educational curriculum-style structure
118
- The two-column 'table' image include such key features:
119
- - Two columns header columns
120
- - Headers like 'Subject content' and 'Additional information'
121
- - Numbered sections (e.g., 2.1, 3.4)
122
- - Educational curriculum-style structure
123
- - Bullet description in 'Additional information'
124
  If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
125
  If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
126
- If the image does not show a table at all, respond with 'NO_TABLE'.
127
  Return only one of these exact labels.
128
  """
129
  global _GEMINI_CLIENT
 
 
130
  client = _GEMINI_CLIENT
 
131
  resp = client.models.generate_content(
132
  model="gemini-2.0-flash",
133
  contents=[
@@ -143,7 +270,7 @@ Return only one of these exact labels.
143
  ]
144
  }
145
  ],
146
- config=types.GenerateContentConfig(temperature=0.)
147
  )
148
  if resp and resp.text:
149
  classification = resp.text.strip().upper()
@@ -151,6 +278,8 @@ Return only one of these exact labels.
151
  return "THREE_COLUMN"
152
  elif "TWO" in classification:
153
  return "TWO_COLUMN"
 
 
154
  return "NO_TABLE"
155
  except Exception as e:
156
  logger.error(f"Gemini table classification error: {e}")
@@ -166,14 +295,158 @@ async def classify_image_async(image_data: bytes, api_key: str, max_retries: int
166
  preprocessed = preprocess_image(image_data)
167
  return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  class S3ImageWriter(DataWriter):
170
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
171
  self.s3_writer = s3_writer
172
- # Use the provided base_path (which can be based on the PDF file name)
173
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
174
  self.gemini_api_key = gemini_api_key
175
  self.descriptions = {}
176
  self._img_count = 0
 
 
 
177
 
178
  def write(self, path: str, data: bytes) -> None:
179
  self._img_count += 1
@@ -189,33 +462,45 @@ class S3ImageWriter(DataWriter):
189
 
190
  async def post_process_async(self, key: str, md_content: str) -> str:
191
  logger.info("Classifying images to detect tables.")
192
- tasks = []
193
- for p, info in self.descriptions.items():
194
- tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
195
-
196
- for p, task in tasks:
197
- try:
198
- classification = await task
199
- self.descriptions[p]['table_classification'] = classification
200
- except Exception as e:
201
- logger.error(f"Table classification error: {e}")
202
  self.descriptions[p]['table_classification'] = "NO_TABLE"
203
-
204
- for p, info in self.descriptions.items():
 
 
 
205
  cls = info['table_classification']
206
  if cls == "TWO_COLUMN":
207
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
208
  elif cls == "THREE_COLUMN":
209
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
 
 
 
 
 
 
 
 
 
210
  else:
211
  info['final_alt'] = "NO_TABLE image"
212
  md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
213
-
214
  md_content = await self._process_table_images_in_markdown(key, md_content)
215
- final_lines = []
216
- for line in md_content.split("\n"):
217
- if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
218
- final_lines.append(line.strip())
 
 
219
  return "\n".join(final_lines)
220
 
221
  async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
@@ -223,6 +508,7 @@ class S3ImageWriter(DataWriter):
223
  matches = re.findall(pat, md_content, flags=re.IGNORECASE)
224
  if not matches:
225
  return md_content
 
226
  for (col_type, s3_key) in matches:
227
  logger.info(f"Processing table image: {s3_key}, columns={col_type}")
228
  img_data = None
@@ -233,9 +519,12 @@ class S3ImageWriter(DataWriter):
233
  if img_data is None:
234
  logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
235
  continue
 
 
236
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
237
  temp_file.write(img_data)
238
  temp_path = temp_file.name
 
239
  try:
240
  if col_type.lower() == 'two':
241
  extractor = TableExtractor(
@@ -252,141 +541,93 @@ class S3ImageWriter(DataWriter):
252
  subtopic_threshold=0.2
253
  )
254
  row_boxes = extractor.process_image(temp_path)
255
- snippet = ["**Extracted table cells:**"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  for i, row in enumerate(row_boxes):
 
257
  for j, _ in enumerate(row):
258
- cell_unique_key = f"{self.base_path}cells/{os.path.basename(s3_key).split('.')[0]}_row{i}_col{j}.jpg"
259
- self.s3_writer.write(cell_unique_key, img_data)
260
- snippet.append(f"![Row {i} Col {j}]({cell_unique_key})")
261
- new_snip = "\n".join(snippet)
262
- old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
263
- md_content = md_content.replace(old_line, new_snip)
264
- except Exception as e:
265
- logger.error(f"Error processing table image {s3_key}: {e}")
266
- finally:
267
- try:
268
- os.remove(temp_path)
269
- except Exception:
270
- pass
271
- return md_content
272
 
273
- def post_process(self, key: str, md_content: str) -> str:
274
- return asyncio.run(self.post_process_async(key, md_content))
275
 
276
- class LocalImageWriter(DataWriter):
277
- def __init__(self, output_folder: str, gemini_api_key: str):
278
- self.output_folder = output_folder
279
- os.makedirs(self.output_folder, exist_ok=True)
280
- self.descriptions = {}
281
- self._img_count = 0
282
- self.gemini_api_key = gemini_api_key
283
- # New mapping to store extracted table cell image paths for testing.
284
- self.extracted_tables = {}
285
 
286
- def write(self, path: str, data: bytes) -> None:
287
- self._img_count += 1
288
- unique_id = f"img_{self._img_count}.jpg"
289
- self.descriptions[path] = {
290
- "data": data,
291
- "relative_path": unique_id,
292
- "table_classification": "NO_TABLE",
293
- "final_alt": ""
294
- }
295
- # Also save the original image locally for testing.
296
- image_path = os.path.join(self.output_folder, unique_id)
297
- with open(image_path, "wb") as f:
298
- f.write(data)
299
 
300
- async def post_process_async(self, key: str, md_content: str) -> str:
301
- logger.info("Classifying images to detect tables.")
302
- tasks = []
303
- for p, info in self.descriptions.items():
304
- tasks.append((p, classify_image_async(info["data"], self.gemini_api_key)))
305
- for p, task in tasks:
306
- try:
307
- classification = await task
308
- self.descriptions[p]['table_classification'] = classification
309
- except Exception as e:
310
- logger.error(f"Table classification error: {e}")
311
- self.descriptions[p]['table_classification'] = "NO_TABLE"
312
- for p, info in self.descriptions.items():
313
- cls = info['table_classification']
314
- if cls == "TWO_COLUMN":
315
- info['final_alt'] = "HAS TO BE PROCESSED - two column table"
316
- elif cls == "THREE_COLUMN":
317
- info['final_alt'] = "HAS TO BE PROCESSED - three column table"
318
- else:
319
- info['final_alt'] = "NO_TABLE image"
320
- md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['relative_path']})")
321
- md_content = self._process_table_images_in_markdown(md_content)
322
- final_lines = []
323
- for line in md_content.split("\n"):
324
- if re.match(r"^\!\[.*\]\(.*\)", line.strip()):
325
- final_lines.append(line.strip())
326
- return "\n".join(final_lines)
327
 
328
- def _process_table_images_in_markdown(self, md_content: str) -> str:
329
- pat = r"!\[HAS TO BE PROCESSED - (two|three) column table\]\(([^)]+)\)"
330
- matches = re.findall(pat, md_content, flags=re.IGNORECASE)
331
- if not matches:
332
- return md_content
333
- for (col_type, image_id) in matches:
334
- logger.info(f"Processing table image => {image_id}, columns={col_type}")
335
- temp_path = os.path.join(self.output_folder, image_id)
336
- desc_item = None
337
- for k, val in self.descriptions.items():
338
- if val["relative_path"] == image_id:
339
- desc_item = val
340
- break
341
- if not desc_item:
342
- logger.warning(f"No matching image data for {image_id}, skipping extraction.")
343
- continue
344
- if not os.path.exists(temp_path):
345
- with open(temp_path, "wb") as f:
346
- f.write(desc_item["data"])
347
- try:
348
- if col_type.lower() == 'two':
349
- extractor = TableExtractor(
350
- skip_header=True,
351
- merge_two_col_rows=True,
352
- enable_subtopic_merge=True,
353
- subtopic_threshold=0.2
354
- )
355
- else:
356
- extractor = TableExtractor(
357
- skip_header=True,
358
- merge_two_col_rows=False,
359
- enable_subtopic_merge=False,
360
- subtopic_threshold=0.2
361
- )
362
- row_boxes = extractor.process_image(temp_path)
363
- out_folder = temp_path + "_rows"
364
- os.makedirs(out_folder, exist_ok=True)
365
- extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
366
- # List all extracted cell images relative to the output folder.
367
- extracted_cells = []
368
- for root, dirs, files in os.walk(out_folder):
369
- for file in files:
370
- rel_path = os.path.relpath(os.path.join(root, file), self.output_folder)
371
- extracted_cells.append(rel_path)
372
- # Save mapping for testing.
373
- self.extracted_tables[image_id] = extracted_cells
374
  snippet = ["**Extracted table cells:**"]
375
  for i, row in enumerate(row_boxes):
376
- row_dir = os.path.join(out_folder, f"row_{i}")
377
  for j, _ in enumerate(row):
378
- cell_file = f"col_{j}.jpg"
379
- cell_path = os.path.join(row_dir, cell_file)
380
- relp = os.path.relpath(cell_path, self.output_folder)
381
- snippet.append(f"![Row {i} Col {j}]({relp})")
382
  new_snip = "\n".join(snippet)
383
- old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({image_id})"
384
  md_content = md_content.replace(old_line, new_snip)
 
385
  except Exception as e:
386
- logger.error(f"Error processing table image {image_id}: {e}")
387
  finally:
388
- if os.path.exists(temp_path):
389
- os.remove(temp_path)
390
  return md_content
391
 
392
  def post_process(self, key: str, md_content: str) -> str:
@@ -492,6 +733,15 @@ In that scenario, your output might look like:
492
  "2.3 A2 Unit 3": [24, 30],
493
  "2.4 A2 Unit 4": [31, 35]
494
  }}
 
 
 
 
 
 
 
 
 
495
  4. Another example might list subtopics:
496
  3.1 Overarching themes 11
497
  3.2 A: Proof 12
@@ -585,31 +835,24 @@ Now, extract topics from this text:
585
  return "\n".join(text_parts)
586
 
587
  class MineruNoTextProcessor:
588
- def __init__(self, output_folder: str, gemini_api_key: str = None):
589
  self.output_folder = output_folder
590
  os.makedirs(self.output_folder, exist_ok=True)
591
  self.layout_model = "doclayout_yolo"
592
  self.formula_enable = True
593
  self.table_enable = False
594
  self.language = "en"
595
- self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=10)
 
596
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
597
- # For testing via __main__, force local saving.
598
- if __name__ == "__main__":
599
- logger.info("Running in test mode: using local image writer.")
600
- self.use_s3 = False
601
- else:
602
- if (os.getenv("S3_ACCESS_KEY") and os.getenv("S3_SECRET_KEY") and
603
- os.getenv("S3_BUCKET_NAME") and os.getenv("S3_ENDPOINT")):
604
- self.use_s3 = True
605
- self.s3_writer = s3Writer(
606
- ak=os.getenv("S3_ACCESS_KEY"),
607
- sk=os.getenv("S3_SECRET_KEY"),
608
- bucket=os.getenv("S3_BUCKET_NAME"),
609
- endpoint_url=os.getenv("S3_ENDPOINT")
610
- )
611
- else:
612
- self.use_s3 = False
613
 
614
  def cleanup_gpu(self):
615
  try:
@@ -622,8 +865,10 @@ class MineruNoTextProcessor:
622
  def process(self, pdf_path: str) -> Dict[str, Any]:
623
  logger.info(f"Processing PDF: {pdf_path}")
624
  try:
 
625
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
626
  logger.info(f"Gemini returned subtopics: {subtopics}")
 
627
  if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
628
  response = requests.get(pdf_path)
629
  if response.status_code != 200:
@@ -635,46 +880,54 @@ class MineruNoTextProcessor:
635
  with open(pdf_path, "rb") as f:
636
  pdf_bytes = f.read()
637
  logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
 
638
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
639
  total_pages = doc.page_count
640
  doc.close()
 
 
641
  final_pages = set()
642
  if not subtopics:
643
- logger.warning("No subtopics found. Processing entire PDF as fallback.")
644
  final_pages = set(range(total_pages))
645
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  for subname, rng in subtopics.items():
647
  if not (isinstance(rng, list) and len(rng) == 2):
648
- logger.warning(f"Skipping subtopic '{subname}' => invalid range {rng}")
649
  continue
650
  start_p, end_p = rng
651
  if start_p > end_p:
652
- logger.warning(f"Skipping subtopic '{subname}' => start > end {rng}")
653
  continue
654
- occs = find_all_occurrences(pdf_bytes, subname)
655
- logger.info(f"Occurrences of subtopic '{subname}': {occs}")
656
- doc_start_0 = start_p - 1
657
- chosen_page = None
658
- for p in occs:
659
- if p >= doc_start_0:
660
- chosen_page = p
661
- break
662
- if chosen_page is None:
663
- chosen_page = occs[-1] if occs else 0
664
- logger.warning(f"No suitable occurrence for '{subname}'. Using page {chosen_page}.")
665
- raw_offset = chosen_page - doc_start_0
666
- offset = max(0, raw_offset)
667
- s0 = (start_p - 1) + offset
668
- e0 = (end_p - 1) + offset
669
- s0 = max(0, min(total_pages - 1, s0))
670
- e0 = max(0, min(total_pages - 1, e0))
671
  for pp in range(s0, e0 + 1):
672
  final_pages.add(pp)
 
673
  if not final_pages:
674
- logger.warning("No valid pages after offset. Processing entire PDF.")
675
  final_pages = set(range(total_pages))
 
676
  logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
677
  subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
 
 
678
  dataset = PymuDocDataset(subset_pdf_bytes)
679
  inference = doc_analyze(
680
  dataset,
@@ -684,49 +937,36 @@ class MineruNoTextProcessor:
684
  formula_enable=self.formula_enable,
685
  table_enable=self.table_enable
686
  )
687
- logger.info("doc_analyze complete. Extracting images.")
688
- key = os.path.splitext(os.path.basename(pdf_path))[0]
689
- if self.use_s3:
690
- writer = S3ImageWriter(self.s3_writer, f"{key}/", self.gemini_api_key)
691
- md_prefix = f"{key}/"
692
- else:
693
- writer = LocalImageWriter(self.output_folder, self.gemini_api_key)
694
- md_prefix = "local-unique-prefix/"
695
  pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
696
  md_content = pipe_result.get_markdown(md_prefix)
697
  final_markdown = writer.post_process(md_prefix, md_content)
698
 
699
- output_json = {
700
- "subtopics": subtopics
701
- }
702
- if not self.use_s3 and isinstance(writer, LocalImageWriter):
703
- local_images = {k: v["relative_path"] for k, v in writer.descriptions.items()}
704
- tables_extracted = writer.extracted_tables
705
- output_json["local_images"] = local_images
706
- output_json["tables_extracted"] = tables_extracted
707
- # Save output in JSON format.
708
- out_json = json.dumps(output_json, indent=2)
709
- # Save JSON locally.
710
- out_path = os.path.join(self.output_folder, "final_output.json")
711
  with open(out_path, "w", encoding="utf-8") as f:
712
- f.write(out_json)
713
- logger.info(f"Final JSON saved locally at {out_path}")
714
- # Also save a local copy for testing.
715
- local_md_path = os.path.join(self.output_folder, "final_output_local.json")
716
- with open(local_md_path, "w", encoding="utf-8") as f:
717
- f.write(out_json)
718
- logger.info(f"Final JSON saved locally at {local_md_path}")
719
- return output_json
720
  finally:
721
  self.cleanup_gpu()
722
 
723
  if __name__ == "__main__":
724
- input_pdf = "/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf"
725
- output_dir = "/home/user/app/wje"
726
  gemini_key = os.getenv("GEMINI_API_KEY", "AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
727
  try:
728
  processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
729
- result_json = processor.process(input_pdf)
730
  logger.info("Processing completed successfully.")
731
  except Exception as e:
732
  logger.error(f"Processing failed: {e}")
 
35
 
36
  _GEMINI_CLIENT = None
37
 
38
+ #helper functions, also global
39
  def unify_whitespace(text: str) -> str:
40
  return re.sub(r"\s+", " ", text).strip()
41
 
 
67
  doc.close()
68
  return subset_bytes
69
 
70
def unify_topic_name(raw_title: str, children_subtopics: list) -> str:
    """Clean up a topic title extracted from a specification table.

    Processing order:
      1. Strip surrounding whitespace and a trailing "continued"
         (case-insensitive).
      2. A title that already starts with a digit is returned as is.
      3. The "<n>." prefix of every child title is collected.
      4. A known truncated title (see ``known_repairs``) whose required
         prefix equals the first child prefix is replaced by its repaired form.
      5. Otherwise, when all child prefixes agree, that prefix is prepended:
         "<n> <title>", or "<n> Topic" when the title is empty.

    Args:
        raw_title: Title text as recognised from the image.
        children_subtopics: Child nodes (dicts); only their "title" is read.

    Returns:
        The normalised title.
    """
    # Lower-cased broken title -> (child prefix that confirms it, repaired title).
    known_repairs = {"gonometry": ("5", "5 Trigonometry")}

    title = (raw_title or "").strip()
    title = re.sub(r"\s+continued\s*$", "", title, flags=re.IGNORECASE)

    # A title that is already numbered needs no further work.
    if re.match(r"^\d+", title):
        return title

    # Deduce candidate numeric prefixes from children such as "5.1", "5.2".
    prefixes = []
    for child in children_subtopics or []:
        if not isinstance(child, dict):
            continue
        child_title = str(child.get("title") or "").strip()
        m = re.match(r"^(\d+)\.", child_title)
        if m:
            prefixes.append(m.group(1))

    if not prefixes:
        return title

    # The repair must be checked BEFORE a prefix is prepended; afterwards the
    # title would read "5 gonometry" and no longer match the lookup key.
    repair = known_repairs.get(title.lower())
    if repair is not None and prefixes[0] == repair[0]:
        return repair[1]

    if all(p == prefixes[0] for p in prefixes):
        return f"{prefixes[0]} {title}" if title else f"{prefixes[0]} Topic"
    return title
106
+
107
+
108
def merge_topics(subtopic_list: list) -> list:
    """Merge per-image topic nodes into one sorted topic tree.

    Steps:
      1. Clean each topic title with ``unify_topic_name``.
      2. Group topics by the leading number of the cleaned title; topics
         without a leading number are grouped by the full title.
      3. Move each child whose "<n>." prefix names a different existing
         group into that group.
      4. Merge children that share a title (contents and children are
         concatenated).
      5. Sort children by all numbers in their title and parents by their
         leading number; entries without numbers sort last.

    Child nodes are shallow-copied (with fresh "contents"/"children" lists),
    so the dicts passed in by the caller are not modified.

    Args:
        subtopic_list: Topic dicts with optional "title", "contents" and
            "children" keys.

    Returns:
        A list of merged topic dicts ({"title", "contents", "children"}).
    """
    def _copy_node(node: dict) -> dict:
        # Guarantee both list keys exist so duplicates can be merged safely.
        copied = dict(node)
        copied["contents"] = list(node.get("contents") or [])
        copied["children"] = list(node.get("children") or [])
        return copied

    def _title_of(node: dict) -> str:
        return str(node.get("title") or "").strip()

    def _subtopic_order(node: dict) -> tuple:
        # Numeric, not lexical, ordering: "2.1" < "2.2" < "2.10".
        digits = re.findall(r"\d+", str(node.get("title") or ""))
        return tuple(int(d) for d in digits) if digits else (9999,)

    def _parent_order(node: dict) -> int:
        m = re.match(r"^(\d+)", node.get("title", ""))
        return int(m.group(1)) if m else 9999

    # 1-2) Group topics by numeric prefix (or by full title when unnumbered).
    merged = {}
    for topic_obj in subtopic_list:
        children = [_copy_node(c) for c in topic_obj.get("children", [])]
        contents = list(topic_obj.get("contents", []))
        new_title = unify_topic_name(topic_obj.get("title", ""), children)
        m = re.match(r"^(\d+)", new_title)
        key = m.group(1) if m else new_title

        entry = merged.get(key)
        if entry is None:
            merged[key] = {
                "title": new_title,
                "contents": contents,
                "children": children,
            }
        else:
            # Keep the longer (presumably more complete) title.
            if len(new_title) > len(entry["title"]):
                entry["title"] = new_title
            entry["contents"].extend(contents)
            entry["children"].extend(children)

    # 3) Reassign children to the group matching their own numeric prefix.
    #    Only other groups' lists are appended to, never the one being read.
    for key, topic in merged.items():
        kept = []
        for child in topic["children"]:
            m_child = re.match(r"^(\d+)\.", _title_of(child))
            target = m_child.group(1) if m_child else None
            if target is not None and target != key and target in merged:
                merged[target]["children"].append(child)
            else:
                kept.append(child)
        topic["children"] = kept

    # 4-5) Merge duplicate children, then sort them numerically.
    for topic in merged.values():
        unique = {}
        for child in topic["children"]:
            ctitle = _title_of(child)
            seen = unique.get(ctitle)
            if seen is None:
                unique[ctitle] = child
            else:
                seen["contents"].extend(child["contents"])
                seen["children"].extend(child["children"])
        topic["children"] = sorted(unique.values(), key=_subtopic_order)

    return sorted(merged.values(), key=_parent_order)
186
+
187
  class s3Writer:
188
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
189
  self.bucket = bucket
 
195
  )
196
 
197
  def write(self, path: str, data: bytes) -> None:
 
198
  try:
199
  file_obj = BytesIO(data)
200
  self.client.upload_fileobj(
 
207
  logger.error(f"Failed to upload to S3: {str(e)}")
208
  raise
209
 
210
+ def delete(self, path: str) -> None:
211
+ try:
212
+ self.client.delete_object(Bucket=self.bucket, Key=path)
213
+ except Exception as e:
214
+ logger.error(f"Failed to delete from S3: {str(e)}")
215
+ raise
216
+
217
  def preprocess_image(image_data: bytes, max_dim: int = 600, quality: int = 60) -> bytes:
218
  arr = np.frombuffer(image_data, np.uint8)
219
  img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
 
231
  return image_data
232
 
233
  def call_gemini_for_table_classification(image_data: bytes, api_key: str, max_retries: int = 1) -> str:
234
+ """
235
+ Existing Gemini call to classify an image as TWO_COLUMN, THREE_COLUMN, or NO_TABLE.
236
+ """
237
  for attempt in range(max_retries + 1):
238
  try:
239
  prompt = """You are given an image. Determine if it shows a table that has exactly 2 or 3 columns.
240
+ The three-column 'table' image includes such key features:
241
+ - Three columns header
242
+ - Headers like 'Topics', 'Content', 'Guidelines', 'Amplification', 'Additional guidance notes', 'Area of Study'
243
+ - Possibly sections (e.g. 8.4, 9.1)
244
+ The two-column 'table' image includes such key features:
245
+ - Two columns
246
+ - Headers like 'Subject content', 'Additional information'
247
+ - Possibly sections (e.g. 2.1, 3.4, G2, G3, )
 
 
 
248
  If the image is a relevant table with 2 columns, respond with 'TWO_COLUMN'.
249
  If the image is a relevant table with 3 columns, respond with 'THREE_COLUMN'.
250
+ If the image is non-empty but does not show a table, respond with 'NO_TABLE'.
251
  Return only one of these exact labels.
252
  """
253
  global _GEMINI_CLIENT
254
+ if _GEMINI_CLIENT is None:
255
+ _GEMINI_CLIENT = genai.Client(api_key=api_key)
256
  client = _GEMINI_CLIENT
257
+
258
  resp = client.models.generate_content(
259
  model="gemini-2.0-flash",
260
  contents=[
 
270
  ]
271
  }
272
  ],
273
+ config=types.GenerateContentConfig(temperature=0.0)
274
  )
275
  if resp and resp.text:
276
  classification = resp.text.strip().upper()
 
278
  return "THREE_COLUMN"
279
  elif "TWO" in classification:
280
  return "TWO_COLUMN"
281
+ elif "EMPTY" in classification:
282
+ return "EMPTY_IMAGE"
283
  return "NO_TABLE"
284
  except Exception as e:
285
  logger.error(f"Gemini table classification error: {e}")
 
295
  preprocessed = preprocess_image(image_data)
296
  return await loop.run_in_executor(None, call_gemini_for_table_classification, preprocessed, api_key, max_retries)
297
 
298
def call_gemini_for_subtopic_identification_image(image_data: bytes, api_key: str, max_retries: int = 1) -> dict:
    """Ask Gemini to extract a topic title and subtopic numbers from a cell image.

    The image is sent inline as base64 JPEG together with a fixed prompt to
    the "gemini-2.0-flash" model at temperature 0. The reply is expected to be
    a JSON object with "title" and "subtopics"; markdown code fences around
    it are stripped before parsing.

    Args:
        image_data: Encoded image bytes of a single table cell.
        api_key: Key used to create the shared module-level client on first use.
        max_retries: Additional attempts after a failed call or unparsable reply.

    Returns:
        {"title": str, "subtopics": list[str]}. The title is "EMPTY_IMAGE"
        when the model reports a blank/truncated image. An empty title and
        list are returned for an empty reply or once all attempts have failed.
    """
    global _GEMINI_CLIENT

    # NOTE(review): rule (3) and rule (4) below give conflicting instructions
    # for an unnumbered heading (empty title vs. heading as title); the
    # examples follow rule (4). Confirm which is intended before editing.
    # The prompt is a plain string, so JSON braces are written singly.
    prompt = """
    You are given an image from an educational curriculum specification. The image may contain:
    1) A main topic heading in the format: "<number> <Topic Name>", for example "2 Algebra and functions continued".
    2) A subtopic heading in the format "<number>.<number>", for example "2.5", "2.6", or "3.4".
    3) A label-like title in the left column of a two-column table, for example "G2", "G3", "Scarcity, choice and opportunity cost", or similar text without explicit numeric patterns (2.1, 3.4, etc.).
    4) Possibly no relevant text at all.

    Your task is to extract:
    - **"title"**: A recognized main topic or heading text.
    - **"subtopics"**: Any recognized subtopic numbers (e.g. "2.5", "2.6", "3.4"), as an array of strings.

    Follow these rules:

    (1) **If the cell shows a main topic in the format "<number> <Topic Name>",** for example "2 Algebra and functions continued", (remove the word "continued") then:
    - Put that text (without the word "continued") in "title". (e.g. "2 Algebra and functions")
    - "subtopics" should be an empty array, unless you also see smaller subtopic numbers.

    (2) **If the cell shows one or more subtopic numbers** in the format "<number>.<number>", for example "2.5", "2.6", or "3.4", then:
    - Collect those exact strings in the JSON key "subtopics" (an array of strings).
    - "title" in this case should be an empty string if you only detect subtopics.
    (Example: If text is "2.5 Solve linear inequalities...", then "title" = "", "subtopics" = ["2.5"]).

    (3) If no main topic or subtopic is detected but the text appears to be a heading (e.g. "Scarcity, choice and opportunity cost"), return:
    {
    "title": "",
    "subtopics": []
    }

    (4) **If there is no numeric value in the left column** (e.g. "2.1" or "2 <Topic name>" not found) but the left column text appears to be a heading (for instance "Scarcity, choice and opportunity cost"), then:
    - Use the **left column text** as "title".
    - "subtopics" remains empty.
    Example:
    If the left column is "Scarcity, choice and opportunity cost" and the right column has definitions, your output is:
    {
    "title": "Scarcity, choice and opportunity cost",
    "subtopics": []
    }

    (5) **If there is a character + digit pattern** in the left column for a two-column table (for example "G2", "G3", "G4", "C1"), treat that as a topic-like label:
    - Put that label text into "title" (e.g. "G2").
    - "subtopics" remains empty unless you also see actual subtopic formats like "2.5", "3.4" inside the same cell.

    (6) **Output must be valid JSON** in this exact structure, with no extra text or explanation:
    {
    "title": "...",
    "subtopics": [...]
    }

    (7) If the image is blank or truncated, defined as:
    - Contains no words at all (e.g. a blank white or black image)
    - Contains only a truncated snippet of words such as "Topics", "What students need to learn" with blue background
    - Contains a truncated snippet with words like "Topics", "What students need to learn", "Content" with gray background (RGB (166,166,166) or (180,180,180)) then return:
    {
    "title": "EMPTY_IMAGE",
    "subtopics": []
    }

    **Examples**:

    - If the image text is `"2 Algebra and functions continued"`, return:
    {
    "title": "2 Algebra and functions",
    "subtopics": []
    }

    - If the image text is `"2.5 Solve linear and quadratic inequalities ..."`, return:
    {
    "title": "",
    "subtopics": ["2.5"]
    }

    - If the image text is `"Scarcity, choice and opportunity cost"` (with no numeric patterns at all), return:
    {
    "title": "Scarcity, choice and opportunity cost",
    "subtopics": []
    }

    - If the left column says `"G2"` and the right column has details, but no subtopic numbers, return:
    {
    "title": "G2",
    "subtopics": []
    }

    - If you cannot recognize any text matching these patterns, or if nothing is found, return:
    {
    "title": "",
    "subtopics": []
    }
    """

    for attempt in range(max_retries + 1):
        try:
            # Create the shared client on first use.
            if _GEMINI_CLIENT is None:
                _GEMINI_CLIENT = genai.Client(api_key=api_key)
            client = _GEMINI_CLIENT

            resp = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    {
                        "parts": [
                            {"text": prompt},
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": base64.b64encode(image_data).decode("utf-8")
                                }
                            }
                        ]
                    }
                ],
                config=types.GenerateContentConfig(temperature=0.0)
            )

            if not resp or not resp.text:
                logger.warning("Gemini returned an empty response for subtopic extraction.")
                return {"title": "", "subtopics": []}

            raw = resp.text.strip()
            # Remove any markdown fences if present
            raw = raw.replace("```json", "").replace("```", "").strip()
            data = json.loads(raw)
            if not isinstance(data, dict):
                # Handled by the except below, i.e. treated like any other bad reply.
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            # A JSON null / non-string title must not crash .upper() below.
            title = data.get("title")
            title = "" if title is None else str(title)
            if title.upper() == "EMPTY_IMAGE":
                return {"title": "EMPTY_IMAGE", "subtopics": []}

            subtopics = data.get("subtopics", [])
            if not isinstance(subtopics, list):
                subtopics = []
            # The model may emit numbers (2.5) instead of strings ("2.5");
            # callers use these values as titles, so normalise to str.
            subtopics = [str(st) for st in subtopics if st is not None]
            return {"title": title, "subtopics": subtopics}

        except Exception as e:
            logger.error(f"Gemini subtopic identification error on attempt {attempt}: {e}")
            if attempt < max_retries:
                time.sleep(0.5)
            else:
                return {"title": "", "subtopics": []}

    # Only reached when max_retries is negative and the loop never runs.
    return {"title": "", "subtopics": []}
439
+
440
  class S3ImageWriter(DataWriter):
441
  def __init__(self, s3_writer: s3Writer, base_path: str, gemini_api_key: str):
442
  self.s3_writer = s3_writer
 
443
  self.base_path = base_path if base_path.endswith("/") else base_path + "/"
444
  self.gemini_api_key = gemini_api_key
445
  self.descriptions = {}
446
  self._img_count = 0
447
+ self.extracted_tables = {}
448
+
449
+ self.extracted_subtopics = {}
450
 
451
  def write(self, path: str, data: bytes) -> None:
452
  self._img_count += 1
 
462
 
463
  async def post_process_async(self, key: str, md_content: str) -> str:
464
  logger.info("Classifying images to detect tables.")
465
+ tasks = {
466
+ p: asyncio.create_task(classify_image_async(info["data"], self.gemini_api_key))
467
+ for p, info in self.descriptions.items()
468
+ }
469
+ results = await asyncio.gather(*tasks.values(), return_exceptions=True)
470
+ for p, result in zip(list(self.descriptions.keys()), results):
471
+ if isinstance(result, Exception):
472
+ logger.error(f"Table classification error for {p}: {result}")
 
 
473
  self.descriptions[p]['table_classification'] = "NO_TABLE"
474
+ else:
475
+ self.descriptions[p]['table_classification'] = result
476
+
477
+ # Process each image description.
478
+ for p, info in list(self.descriptions.items()):
479
  cls = info['table_classification']
480
  if cls == "TWO_COLUMN":
481
  info['final_alt'] = "HAS TO BE PROCESSED - two column table"
482
  elif cls == "THREE_COLUMN":
483
  info['final_alt'] = "HAS TO BE PROCESSED - three column table"
484
+ elif cls == "EMPTY_IMAGE":
485
+ # Remove markdown reference, delete from descriptions and S3.
486
+ md_content = md_content.replace(f"![]({key}{p})", "")
487
+ try:
488
+ self.s3_writer.delete(info['s3_path'])
489
+ except Exception as e:
490
+ logger.error(f"Error deleting S3 object {info['s3_path']}: {e}")
491
+ del self.descriptions[p]
492
+ continue
493
  else:
494
  info['final_alt'] = "NO_TABLE image"
495
  md_content = md_content.replace(f"![]({key}{p})", f"![{info['final_alt']}]({info['s3_path']})")
496
+
497
  md_content = await self._process_table_images_in_markdown(key, md_content)
498
+
499
+ # Filter final lines to keep only lines with images.
500
+ final_lines = [
501
+ line.strip() for line in md_content.split("\n")
502
+ if re.match(r"^\!\[.*\]\(.*\)", line.strip())
503
+ ]
504
  return "\n".join(final_lines)
505
 
506
  async def _process_table_images_in_markdown(self, key: str, md_content: str) -> str:
 
508
  matches = re.findall(pat, md_content, flags=re.IGNORECASE)
509
  if not matches:
510
  return md_content
511
+
512
  for (col_type, s3_key) in matches:
513
  logger.info(f"Processing table image: {s3_key}, columns={col_type}")
514
  img_data = None
 
519
  if img_data is None:
520
  logger.warning(f"No image data found for S3 key {s3_key}. Skipping.")
521
  continue
522
+
523
+ # Write temporary file for processing.
524
  with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
525
  temp_file.write(img_data)
526
  temp_path = temp_file.name
527
+
528
  try:
529
  if col_type.lower() == 'two':
530
  extractor = TableExtractor(
 
541
  subtopic_threshold=0.2
542
  )
543
  row_boxes = extractor.process_image(temp_path)
544
+
545
+ # logger.info(f"Extracted {len(row_boxes)} rows from {temp_path}")
546
+ # for i, row in enumerate(row_boxes):
547
+ # logger.info(f"Row {i} has {len(row)} cells")
548
+
549
+ out_folder = temp_path + "_rows"
550
+ os.makedirs(out_folder, exist_ok=True)
551
+ # out_folder = os.path.join(os.path.dirname(temp_path), os.path.basename(temp_path) + "_rows")
552
+ # os.makedirs(out_folder, exist_ok=True)
553
+
554
+ extractor.save_extracted_cells(temp_path, row_boxes, out_folder)
555
+ #just to print structure how cells are saved and named for each table image
556
+ # logger.info(f"Files in {out_folder}:")
557
+ # for root, dirs, files in os.walk(out_folder):
558
+ # logger.info(f"{root}: {files}")
559
+
560
+ recognized_main_topic = ""
561
+ main_topic_image_key = None
562
+ recognized_subtopics = []
563
+
564
+ # Loop over each cell image.
565
  for i, row in enumerate(row_boxes):
566
+ row_dir = os.path.join(out_folder, f"row_{i}")
567
  for j, _ in enumerate(row):
568
+ cell_path = os.path.join(row_dir, f"col_{j}.png")
569
+ if not os.path.isfile(cell_path):
570
+ alternative_path = os.path.join(row_dir, f"col_{j}.jpg")
571
+ if os.path.isfile(alternative_path):
572
+ cell_path = alternative_path
573
+ else:
574
+ logger.warning(f"Cell image not found: {cell_path}")
575
+ continue
 
 
 
 
 
 
576
 
577
+ with open(cell_path, "rb") as cf:
578
+ cell_image_data = cf.read()
579
 
580
+ cell_key = f"{self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.png"
581
+ self.s3_writer.write(cell_key, cell_image_data)
 
 
 
 
 
 
 
582
 
583
+ #extract subtopic info from the cell image.
584
+ info = call_gemini_for_subtopic_identification_image(cell_image_data, self.gemini_api_key)
585
+
586
+ # Check if the image is empty.
587
+ if info.get("title", "").upper() == "EMPTY_IMAGE":
588
+ try:
589
+ self.s3_writer.delete(cell_key)
590
+ logger.info(f"Deleted empty cell image from S3: {cell_key}")
591
+ except Exception as e:
592
+ logger.error(f"Error deleting empty cell image {cell_key}: {e}")
593
+ continue # Skip processing this cell further
 
 
594
 
595
+ if info["title"] and not recognized_main_topic:
596
+ recognized_main_topic = info["title"]
597
+ main_topic_image_key = cell_key
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
 
599
+ for st in info["subtopics"]:
600
+ recognized_subtopics.append({
601
+ "title": st,
602
+ "contents": [{"type": "image", "key": cell_key}],
603
+ "children": []
604
+ })
605
+
606
+ final_json = {
607
+ "title": recognized_main_topic,
608
+ "contents": [],
609
+ "children": recognized_subtopics
610
+ }
611
+ if main_topic_image_key:
612
+ final_json["contents"].append({"type": "image", "key": main_topic_image_key})
613
+
614
+ # Save the final JSON.
615
+ self.extracted_subtopics[s3_key] = final_json
616
+
617
+ # Optionally, create a snippet to replace the markdown line.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
  snippet = ["**Extracted table cells:**"]
619
  for i, row in enumerate(row_boxes):
 
620
  for j, _ in enumerate(row):
621
+ snippet.append(f"![Row {i} Col {j}]({self.base_path}cells/{os.path.basename(s3_key)}_r{i}_c{j}.jpg)")
 
 
 
622
  new_snip = "\n".join(snippet)
623
+ old_line = f"![HAS TO BE PROCESSED - {col_type} column table]({s3_key})"
624
  md_content = md_content.replace(old_line, new_snip)
625
+
626
  except Exception as e:
627
+ logger.error(f"Error processing table image {s3_key}: {e}")
628
  finally:
629
+ os.remove(temp_path)
630
+
631
  return md_content
632
 
633
  def post_process(self, key: str, md_content: str) -> str:
 
733
  "2.3 A2 Unit 3": [24, 30],
734
  "2.4 A2 Unit 4": [31, 35]
735
  }}
736
+ or
737
+ 2.1 AS units 6
738
+ 2.2 AS units 23
739
+ In that scenario, your output might look like:
740
+ {{
741
+ "2.1 AS Unit 1": [6, 2],
742
+ "2.2 AS Unit 2": [23, 43]
743
+ }}
744
+
745
  4. Another example might list subtopics:
746
  3.1 Overarching themes 11
747
  3.2 A: Proof 12
 
835
  return "\n".join(text_parts)
836
 
837
  class MineruNoTextProcessor:
838
+ def __init__(self, output_folder: str, gemini_api_key: str):
839
  self.output_folder = output_folder
840
  os.makedirs(self.output_folder, exist_ok=True)
841
  self.layout_model = "doclayout_yolo"
842
  self.formula_enable = True
843
  self.table_enable = False
844
  self.language = "en"
845
+
846
+ self.subtopic_extractor = GeminiTopicExtractor(api_key=gemini_api_key, num_pages=20)
847
  self.gemini_api_key = gemini_api_key or os.getenv("GEMINI_API_KEY", "")
848
+
849
+ self.use_s3 = True
850
+ self.s3_writer = s3Writer(
851
+ ak=os.getenv("S3_ACCESS_KEY"),
852
+ sk=os.getenv("S3_SECRET_KEY"),
853
+ bucket="quextro-resources",
854
+ endpoint_url=os.getenv("S3_ENDPOINT")
855
+ )
 
 
 
 
 
 
 
 
856
 
857
  def cleanup_gpu(self):
858
  try:
 
865
  def process(self, pdf_path: str) -> Dict[str, Any]:
866
  logger.info(f"Processing PDF: {pdf_path}")
867
  try:
868
+ # Possibly call subtopic_extractor on first pages to find subtopics in the PDF as a whole
869
  subtopics = self.subtopic_extractor.extract_subtopics(pdf_path)
870
  logger.info(f"Gemini returned subtopics: {subtopics}")
871
+
872
  if pdf_path.startswith("http://") or pdf_path.startswith("https://"):
873
  response = requests.get(pdf_path)
874
  if response.status_code != 200:
 
880
  with open(pdf_path, "rb") as f:
881
  pdf_bytes = f.read()
882
  logger.info("Loaded %d bytes from local file '%s'", len(pdf_bytes), pdf_path)
883
+
884
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
885
  total_pages = doc.page_count
886
  doc.close()
887
+
888
+ # Decide which pages to process
889
  final_pages = set()
890
  if not subtopics:
891
+ # fallback
892
  final_pages = set(range(total_pages))
893
  else:
894
+ offset_candidates = []
895
+ for subname, rng in subtopics.items():
896
+ start_p, _ = rng
897
+ occs = find_all_occurrences(pdf_bytes, subname)
898
+ for p in occs:
899
+ candidate = p - (start_p - 1)
900
+ if candidate > 0:
901
+ offset_candidates.append(candidate)
902
+ if offset_candidates:
903
+ try:
904
+ from statistics import mode
905
+ global_offset = mode(offset_candidates)
906
+ except:
907
+ from statistics import median
908
+ global_offset = int(median(offset_candidates))
909
+ else:
910
+ global_offset = 0
911
+
912
+ logger.info(f"Computed global offset: {global_offset}")
913
  for subname, rng in subtopics.items():
914
  if not (isinstance(rng, list) and len(rng) == 2):
 
915
  continue
916
  start_p, end_p = rng
917
  if start_p > end_p:
 
918
  continue
919
+ s0 = (start_p - 1) + global_offset
920
+ e0 = (end_p - 1) + global_offset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
  for pp in range(s0, e0 + 1):
922
  final_pages.add(pp)
923
+
924
  if not final_pages:
 
925
  final_pages = set(range(total_pages))
926
+
927
  logger.info(f"Processing pages (0-based): {sorted(final_pages)}")
928
  subset_pdf_bytes = create_subset_pdf(pdf_bytes, sorted(final_pages))
929
+
930
+ # 4) Analyze and produce markdown
931
  dataset = PymuDocDataset(subset_pdf_bytes)
932
  inference = doc_analyze(
933
  dataset,
 
937
  formula_enable=self.formula_enable,
938
  table_enable=self.table_enable
939
  )
940
+ #S3
941
+ writer = S3ImageWriter(self.s3_writer, "/topic-extraction", self.gemini_api_key)
942
+
943
+ md_prefix = "/topic-extraction/"
 
 
 
 
944
  pipe_result = inference.pipe_ocr_mode(writer, lang=self.language)
945
  md_content = pipe_result.get_markdown(md_prefix)
946
  final_markdown = writer.post_process(md_prefix, md_content)
947
 
948
+ subtopic_list = list(writer.extracted_subtopics.values())
949
+ subtopic_list = merge_topics(subtopic_list)
950
+
951
+ out_path = os.path.join(self.output_folder, "_subtopics.json")
 
 
 
 
 
 
 
 
952
  with open(out_path, "w", encoding="utf-8") as f:
953
+ json.dump(subtopic_list, f, indent=2)
954
+ logger.info(f"Final subtopics JSON saved locally at {out_path}")
955
+
956
+ return {
957
+ "final_markdown": final_markdown,
958
+ "subtopics_extracted": subtopic_list
959
+ }
 
960
  finally:
961
  self.cleanup_gpu()
962
 
963
if __name__ == "__main__":
    input_pdf = "/home/user/app/input_output/wjec-gce-as-a-economics-specification-from-2015.pdf"
    output_dir = "/home/user/app/pearson_json"
    # SECURITY: the API key must come from the environment only. A literal key
    # was previously committed here as the fallback value; that key should be
    # treated as leaked and revoked.
    gemini_key = os.getenv("GEMINI_API_KEY", "")
    if not gemini_key:
        logger.error("GEMINI_API_KEY is not set; aborting.")
        raise SystemExit(1)
    try:
        processor = MineruNoTextProcessor(output_folder=output_dir, gemini_api_key=gemini_key)
        result = processor.process(input_pdf)
        logger.info("Processing completed successfully.")
    except Exception as e:
        # logger.exception keeps the traceback, which the bare message lost.
        logger.exception(f"Processing failed: {e}")
topic_extraction_ars.log DELETED
@@ -1,746 +0,0 @@
1
- 2025-03-03 15:45:38,171 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
2
- 2025-03-03 15:45:38,974 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
3
- 2025-03-03 15:45:38,975 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
4
- 2025-03-03 15:45:39,261 [INFO] __main__ - Computed global offset: 4
5
- 2025-03-03 15:45:39,261 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
6
- 2025-03-03 15:46:34,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
7
- 2025-03-03 15:46:36,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
8
- 2025-03-03 15:46:37,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
9
- 2025-03-03 15:46:38,161 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
10
- 2025-03-03 15:46:38,703 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
11
- 2025-03-03 15:46:39,330 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
12
- 2025-03-03 15:46:39,805 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
13
- 2025-03-03 15:46:40,281 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
14
- 2025-03-03 15:46:40,751 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
15
- 2025-03-03 15:46:41,336 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
16
- 2025-03-03 15:46:41,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
17
- 2025-03-03 15:46:42,431 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
18
- 2025-03-03 15:46:42,903 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
19
- 2025-03-03 15:46:43,490 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
20
- 2025-03-03 15:46:43,962 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
21
- 2025-03-03 15:46:44,566 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
22
- 2025-03-03 15:46:45,155 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
23
- 2025-03-03 15:46:45,448 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
24
- 2025-03-03 15:46:45,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
25
- 2025-03-03 15:46:46,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
26
- 2025-03-03 15:46:47,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
27
- 2025-03-03 15:46:47,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
28
- 2025-03-03 15:46:48,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
29
- 2025-03-03 15:46:48,593 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
30
- 2025-03-03 15:46:49,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
31
- 2025-03-03 15:46:49,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
32
- 2025-03-03 15:46:50,274 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
33
- 2025-03-03 15:46:50,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
34
- 2025-03-03 15:46:51,327 [INFO] __main__ - Classifying images to detect tables.
35
- 2025-03-03 15:46:55,176 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
36
- 2025-03-03 15:46:58,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
37
- 2025-03-03 15:46:58,952 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
38
- 2025-03-03 15:46:59,179 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
39
- 2025-03-03 15:46:59,433 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
40
- 2025-03-03 15:46:59,434 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
41
- 2025-03-03 15:47:02,885 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
42
- 2025-03-03 15:47:03,187 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
43
- 2025-03-03 15:47:03,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
44
- 2025-03-03 15:47:03,657 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
45
- 2025-03-03 15:47:03,872 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
46
- 2025-03-03 15:47:03,873 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
47
- 2025-03-03 15:47:07,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
48
- 2025-03-03 15:47:07,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
49
- 2025-03-03 15:47:07,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
50
- 2025-03-03 15:47:07,918 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
51
- 2025-03-03 15:47:11,395 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
52
- 2025-03-03 15:47:11,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
53
- 2025-03-03 15:47:11,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
54
- 2025-03-03 15:47:12,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
55
- 2025-03-03 15:47:12,138 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
56
- 2025-03-03 15:47:15,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
57
- 2025-03-03 15:47:16,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
58
- 2025-03-03 15:47:16,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
59
- 2025-03-03 15:47:16,611 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
60
- 2025-03-03 15:47:16,850 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
61
- 2025-03-03 15:47:16,850 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
62
- 2025-03-03 15:47:20,810 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
63
- 2025-03-03 15:47:21,101 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
64
- 2025-03-03 15:47:21,322 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
65
- 2025-03-03 15:47:21,549 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
66
- 2025-03-03 15:47:21,549 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
67
- 2025-03-03 15:47:25,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
68
- 2025-03-03 15:47:25,405 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
69
- 2025-03-03 15:47:25,599 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
70
- 2025-03-03 15:47:25,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
71
- 2025-03-03 15:47:26,054 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
72
- 2025-03-03 15:47:26,054 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
73
- 2025-03-03 15:47:29,662 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
74
- 2025-03-03 15:47:29,944 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
75
- 2025-03-03 15:47:30,160 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
76
- 2025-03-03 15:47:30,354 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
77
- 2025-03-03 15:47:30,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
78
- 2025-03-03 15:47:30,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
79
- 2025-03-03 15:47:31,028 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
80
- 2025-03-03 15:47:31,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
81
- 2025-03-03 15:47:31,461 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
82
- 2025-03-03 15:47:31,654 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
83
- 2025-03-03 15:47:31,912 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
84
- 2025-03-03 15:47:32,139 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
85
- 2025-03-03 15:47:32,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
86
- 2025-03-03 15:47:32,586 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
87
- 2025-03-03 15:47:32,587 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
88
- 2025-03-03 15:47:36,350 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
89
- 2025-03-03 15:47:36,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
90
- 2025-03-03 15:47:36,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
91
- 2025-03-03 15:47:37,141 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
92
- 2025-03-03 15:47:37,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
93
- 2025-03-03 15:47:37,565 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
94
- 2025-03-03 15:47:37,760 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
95
- 2025-03-03 15:47:38,012 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
96
- 2025-03-03 15:47:38,226 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
97
- 2025-03-03 15:47:38,226 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
98
- 2025-03-03 15:47:42,402 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
99
- 2025-03-03 15:47:42,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
100
- 2025-03-03 15:47:42,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
101
- 2025-03-03 15:47:43,133 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
102
- 2025-03-03 15:47:43,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
103
- 2025-03-03 15:47:43,355 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
104
- 2025-03-03 15:47:48,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
105
- 2025-03-03 15:47:48,332 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
106
- 2025-03-03 15:47:48,540 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
107
- 2025-03-03 15:47:48,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
108
- 2025-03-03 15:47:49,037 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
109
- 2025-03-03 15:47:49,264 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
110
- 2025-03-03 15:47:49,264 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
111
- 2025-03-03 15:47:53,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
112
- 2025-03-03 15:47:53,598 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
113
- 2025-03-03 15:47:53,819 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
114
- 2025-03-03 15:47:54,034 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
115
- 2025-03-03 15:47:54,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
116
- 2025-03-03 15:47:54,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
117
- 2025-03-03 15:47:54,474 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
118
- 2025-03-03 15:47:57,779 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
119
- 2025-03-03 15:47:58,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
120
- 2025-03-03 15:47:58,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
121
- 2025-03-03 15:47:58,545 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
122
- 2025-03-03 15:47:58,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
123
- 2025-03-03 15:47:58,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
124
- 2025-03-03 15:47:58,994 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
125
- 2025-03-03 15:48:03,866 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
126
- 2025-03-03 15:48:04,164 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
127
- 2025-03-03 15:48:04,382 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
128
- 2025-03-03 15:48:04,605 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
129
- 2025-03-03 15:48:04,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
130
- 2025-03-03 15:48:05,032 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
131
- 2025-03-03 15:48:05,247 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
132
- 2025-03-03 15:48:05,493 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
133
- 2025-03-03 15:48:05,710 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
134
- 2025-03-03 15:48:05,711 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
135
- 2025-03-03 15:48:09,411 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
136
- 2025-03-03 15:48:09,698 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
137
- 2025-03-03 15:48:09,923 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
138
- 2025-03-03 15:48:10,113 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
139
- 2025-03-03 15:48:10,361 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
140
- 2025-03-03 15:48:10,587 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
141
- 2025-03-03 15:48:10,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
142
- 2025-03-03 15:48:10,800 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
143
- 2025-03-03 15:48:14,668 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
144
- 2025-03-03 15:48:14,969 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
145
- 2025-03-03 15:48:15,207 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
146
- 2025-03-03 15:48:15,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
147
- 2025-03-03 15:48:15,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
148
- 2025-03-03 15:48:15,893 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
149
- 2025-03-03 15:48:16,111 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
150
- 2025-03-03 15:48:16,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
151
- 2025-03-03 15:48:17,176 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
152
- 2025-03-03 15:48:17,176 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
153
- 2025-03-03 15:48:20,954 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
154
- 2025-03-03 15:48:21,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
155
- 2025-03-03 15:48:21,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
156
- 2025-03-03 15:48:21,634 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
157
- 2025-03-03 15:48:21,832 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
158
- 2025-03-03 15:48:22,056 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
159
- 2025-03-03 15:48:22,261 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
160
- 2025-03-03 15:48:22,481 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
161
- 2025-03-03 15:48:22,482 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
162
- 2025-03-03 15:48:23,665 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
163
- 2025-03-03 15:48:23,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
164
- 2025-03-03 15:48:24,035 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
165
- 2025-03-03 15:48:24,219 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
166
- 2025-03-03 15:48:24,219 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
167
- 2025-03-03 15:48:27,206 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
168
- 2025-03-03 15:48:27,482 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
169
- 2025-03-03 15:48:27,693 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
170
- 2025-03-03 15:48:27,924 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
171
- 2025-03-03 15:48:28,131 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
172
- 2025-03-03 15:48:28,337 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
173
- 2025-03-03 15:48:28,338 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
174
- 2025-03-03 15:48:32,733 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
175
- 2025-03-03 15:48:32,995 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
176
- 2025-03-03 15:48:33,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
177
- 2025-03-03 15:48:33,449 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
178
- 2025-03-03 15:48:33,449 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
179
- 2025-03-03 15:48:37,495 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
180
- 2025-03-03 15:48:37,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
181
- 2025-03-03 15:48:38,060 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
182
- 2025-03-03 15:48:38,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
183
- 2025-03-03 15:48:38,267 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
184
- 2025-03-03 15:48:42,539 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
185
- 2025-03-03 15:48:42,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
186
- 2025-03-03 15:48:43,064 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
187
- 2025-03-03 15:48:43,280 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
188
- 2025-03-03 15:48:43,487 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
189
- 2025-03-03 15:48:43,716 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
190
- 2025-03-03 15:48:43,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
191
- 2025-03-03 15:48:43,918 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
192
- 2025-03-03 15:48:47,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
193
- 2025-03-03 15:48:47,900 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
194
- 2025-03-03 15:48:48,125 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
195
- 2025-03-03 15:48:48,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
196
- 2025-03-03 15:48:48,343 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
197
- 2025-03-03 15:48:52,065 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
198
- 2025-03-03 15:48:52,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
199
- 2025-03-03 15:48:52,614 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
200
- 2025-03-03 15:48:52,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
201
- 2025-03-03 15:48:53,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
202
- 2025-03-03 15:48:53,066 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
203
- 2025-03-03 15:48:56,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
204
- 2025-03-03 15:48:56,863 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
205
- 2025-03-03 15:48:57,087 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
206
- 2025-03-03 15:48:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
207
- 2025-03-03 15:48:57,526 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
208
- 2025-03-03 15:48:57,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
209
- 2025-03-03 15:48:57,759 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
210
- 2025-03-03 15:49:01,116 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
211
- 2025-03-03 15:49:01,407 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
212
- 2025-03-03 15:49:01,618 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
213
- 2025-03-03 15:49:01,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
214
- 2025-03-03 15:49:01,847 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
215
- 2025-03-03 15:49:04,977 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
216
- 2025-03-03 15:49:05,258 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
217
- 2025-03-03 15:49:05,498 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
218
- 2025-03-03 15:49:05,712 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
219
- 2025-03-03 15:49:05,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
220
- 2025-03-03 15:49:06,162 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
221
- 2025-03-03 15:49:06,385 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
222
- 2025-03-03 15:49:06,612 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
223
- 2025-03-03 15:49:06,613 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
224
- 2025-03-03 15:49:10,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
225
- 2025-03-03 15:49:10,328 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
226
- 2025-03-03 15:49:10,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
227
- 2025-03-03 15:49:10,777 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
228
- 2025-03-03 15:49:10,780 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
229
- 2025-03-03 15:49:11,098 [INFO] __main__ - GPU memory cleaned up.
230
- 2025-03-03 15:49:11,106 [INFO] __main__ - Processing completed successfully.
231
- 2025-03-03 15:53:27,401 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
232
- 2025-03-03 15:53:28,230 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
233
- 2025-03-03 15:53:28,231 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
234
- 2025-03-03 15:53:28,557 [INFO] __main__ - Computed global offset: 4
235
- 2025-03-03 15:53:28,557 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
236
- 2025-03-03 15:54:23,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
237
- 2025-03-03 15:54:25,210 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
238
- 2025-03-03 15:54:25,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
239
- 2025-03-03 15:54:26,250 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
240
- 2025-03-03 15:54:26,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
241
- 2025-03-03 15:54:27,347 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
242
- 2025-03-03 15:54:27,803 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
243
- 2025-03-03 15:54:28,391 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
244
- 2025-03-03 15:54:28,891 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
245
- 2025-03-03 15:54:29,437 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
246
- 2025-03-03 15:54:29,870 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
247
- 2025-03-03 15:54:30,421 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
248
- 2025-03-03 15:54:30,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
249
- 2025-03-03 15:54:31,438 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
250
- 2025-03-03 15:54:32,029 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
251
- 2025-03-03 15:54:32,600 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
252
- 2025-03-03 15:54:33,157 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
253
- 2025-03-03 15:54:33,444 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
254
- 2025-03-03 15:54:33,920 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
255
- 2025-03-03 15:54:34,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
256
- 2025-03-03 15:54:35,147 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
257
- 2025-03-03 15:54:35,680 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
258
- 2025-03-03 15:54:36,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
259
- 2025-03-03 15:54:36,554 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
260
- 2025-03-03 15:54:37,089 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
261
- 2025-03-03 15:54:37,502 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
262
- 2025-03-03 15:54:38,008 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
263
- 2025-03-03 15:54:38,585 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
264
- 2025-03-03 15:54:39,068 [INFO] __main__ - Classifying images to detect tables.
265
- 2025-03-03 15:54:42,753 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
266
- 2025-03-03 15:54:46,419 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
267
- 2025-03-03 15:54:46,711 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
268
- 2025-03-03 15:54:46,896 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
269
- 2025-03-03 15:54:47,110 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
270
- 2025-03-03 15:54:47,110 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
271
- 2025-03-03 15:54:50,464 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
272
- 2025-03-03 15:54:50,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
273
- 2025-03-03 15:54:50,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
274
- 2025-03-03 15:54:51,228 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
275
- 2025-03-03 15:54:51,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
276
- 2025-03-03 15:54:51,463 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
277
- 2025-03-03 15:54:55,079 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
278
- 2025-03-03 15:54:55,364 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
279
- 2025-03-03 15:54:55,570 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r1_c0.jpg
280
- 2025-03-03 15:54:55,571 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
281
- 2025-03-03 15:54:58,838 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
282
- 2025-03-03 15:54:59,144 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
283
- 2025-03-03 15:54:59,326 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
284
- 2025-03-03 15:54:59,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
285
- 2025-03-03 15:54:59,578 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
286
- 2025-03-03 15:55:03,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
287
- 2025-03-03 15:55:03,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
288
- 2025-03-03 15:55:03,983 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
289
- 2025-03-03 15:55:04,202 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
290
- 2025-03-03 15:55:04,417 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
291
- 2025-03-03 15:55:04,417 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
292
- 2025-03-03 15:55:08,109 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
293
- 2025-03-03 15:55:08,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
294
- 2025-03-03 15:55:08,629 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
295
- 2025-03-03 15:55:08,816 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
296
- 2025-03-03 15:55:08,816 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
297
- 2025-03-03 15:55:12,344 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
298
- 2025-03-03 15:55:12,644 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
299
- 2025-03-03 15:55:12,867 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
300
- 2025-03-03 15:55:13,114 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
301
- 2025-03-03 15:55:13,343 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c1.jpg
302
- 2025-03-03 15:55:13,344 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
303
- 2025-03-03 15:55:16,823 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
304
- 2025-03-03 15:55:17,140 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
305
- 2025-03-03 15:55:17,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c2.jpg
306
- 2025-03-03 15:55:17,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
307
- 2025-03-03 15:55:18,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
308
- 2025-03-03 15:55:18,320 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c2.jpg
309
- 2025-03-03 15:55:18,619 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
310
- 2025-03-03 15:55:18,911 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c1.jpg
311
- 2025-03-03 15:55:19,208 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
312
- 2025-03-03 15:55:19,491 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c1.jpg
313
- 2025-03-03 15:55:19,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
314
- 2025-03-03 15:55:20,093 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c1.jpg
315
- 2025-03-03 15:55:20,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c0.jpg
316
- 2025-03-03 15:55:20,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r5_c1.jpg
317
- 2025-03-03 15:55:20,690 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
318
- 2025-03-03 15:55:24,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
319
- 2025-03-03 15:55:24,859 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c1.jpg
320
- 2025-03-03 15:55:25,142 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c2.jpg
321
- 2025-03-03 15:55:25,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
322
- 2025-03-03 15:55:25,738 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c1.jpg
323
- 2025-03-03 15:55:26,031 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c0.jpg
324
- 2025-03-03 15:55:26,335 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r2_c1.jpg
325
- 2025-03-03 15:55:26,616 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c0.jpg
326
- 2025-03-03 15:55:26,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r3_c1.jpg
327
- 2025-03-03 15:55:26,909 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
328
- 2025-03-03 15:55:30,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
329
- 2025-03-03 15:55:30,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
330
- 2025-03-03 15:55:30,961 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
331
- 2025-03-03 15:55:31,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
332
- 2025-03-03 15:55:31,547 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
333
- 2025-03-03 15:55:31,549 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=two
334
- 2025-03-03 15:55:34,706 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
335
- 2025-03-03 15:55:34,994 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
336
- 2025-03-03 15:55:35,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
337
- 2025-03-03 15:55:35,558 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
338
- 2025-03-03 15:55:35,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
339
- 2025-03-03 15:55:36,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r5_c0.jpg
340
- 2025-03-03 15:55:36,137 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
341
- 2025-03-03 15:55:39,497 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
342
- 2025-03-03 15:55:39,757 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
343
- 2025-03-03 15:55:40,062 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
344
- 2025-03-03 15:55:40,345 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
345
- 2025-03-03 15:55:40,666 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
346
- 2025-03-03 15:55:40,976 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
347
- 2025-03-03 15:55:40,977 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
348
- 2025-03-03 15:55:44,159 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
349
- 2025-03-03 15:55:44,436 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
350
- 2025-03-03 15:55:44,643 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
351
- 2025-03-03 15:55:44,853 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
352
- 2025-03-03 15:55:45,041 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
353
- 2025-03-03 15:55:45,254 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
354
- 2025-03-03 15:55:45,255 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=three
355
- 2025-03-03 15:55:49,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
356
- 2025-03-03 15:55:49,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c1.jpg
357
- 2025-03-03 15:55:50,075 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
358
- 2025-03-03 15:55:50,355 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c1.jpg
359
- 2025-03-03 15:55:50,647 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r2_c0.jpg
360
- 2025-03-03 15:55:50,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r3_c0.jpg
361
- 2025-03-03 15:55:51,295 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c0.jpg
362
- 2025-03-03 15:55:51,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r4_c1.jpg
363
- 2025-03-03 15:55:51,855 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r5_c0.jpg
364
- 2025-03-03 15:55:51,856 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
365
- 2025-03-03 15:55:55,882 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
366
- 2025-03-03 15:55:56,182 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
367
- 2025-03-03 15:55:56,463 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
368
- 2025-03-03 15:55:56,727 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c1.jpg
369
- 2025-03-03 15:55:57,005 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
370
- 2025-03-03 15:55:57,301 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
371
- 2025-03-03 15:55:57,584 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r4_c0.jpg
372
- 2025-03-03 15:55:57,584 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
373
- 2025-03-03 15:56:01,615 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
374
- 2025-03-03 15:56:01,906 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
375
- 2025-03-03 15:56:02,222 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
376
- 2025-03-03 15:56:02,513 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c1.jpg
377
- 2025-03-03 15:56:02,801 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
378
- 2025-03-03 15:56:03,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
379
- 2025-03-03 15:56:03,393 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c1.jpg
380
- 2025-03-03 15:56:03,676 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
381
- 2025-03-03 15:56:04,667 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r5_c0.jpg
382
- 2025-03-03 15:56:04,667 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
383
- 2025-03-03 15:56:09,007 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
384
- 2025-03-03 15:56:09,286 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
385
- 2025-03-03 15:56:09,520 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
386
- 2025-03-03 15:56:09,740 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
387
- 2025-03-03 15:56:09,947 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
388
- 2025-03-03 15:56:10,171 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
389
- 2025-03-03 15:56:10,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
390
- 2025-03-03 15:56:10,610 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r5_c0.jpg
391
- 2025-03-03 15:56:10,610 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
392
- 2025-03-03 15:56:11,718 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
393
- 2025-03-03 15:56:11,899 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
394
- 2025-03-03 15:56:12,081 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
395
- 2025-03-03 15:56:12,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
396
- 2025-03-03 15:56:12,266 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
397
- 2025-03-03 15:56:15,231 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
398
- 2025-03-03 15:56:15,582 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
399
- 2025-03-03 15:56:15,802 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
400
- 2025-03-03 15:56:16,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
401
- 2025-03-03 15:56:16,234 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
402
- 2025-03-03 15:56:16,451 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
403
- 2025-03-03 15:56:16,452 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=three
404
- 2025-03-03 15:56:20,970 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
405
- 2025-03-03 15:56:21,300 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c1.jpg
406
- 2025-03-03 15:56:21,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c0.jpg
407
- 2025-03-03 15:56:21,742 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r1_c1.jpg
408
- 2025-03-03 15:56:21,742 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
409
- 2025-03-03 15:56:25,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
410
- 2025-03-03 15:56:25,883 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
411
- 2025-03-03 15:56:26,108 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
412
- 2025-03-03 15:56:26,319 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
413
- 2025-03-03 15:56:26,320 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
414
- 2025-03-03 15:56:30,722 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
415
- 2025-03-03 15:56:31,018 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
416
- 2025-03-03 15:56:31,267 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
417
- 2025-03-03 15:56:31,455 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c1.jpg
418
- 2025-03-03 15:56:31,684 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c0.jpg
419
- 2025-03-03 15:56:31,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r2_c1.jpg
420
- 2025-03-03 15:56:32,136 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r3_c0.jpg
421
- 2025-03-03 15:56:32,136 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
422
- 2025-03-03 15:56:35,410 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
423
- 2025-03-03 15:56:35,689 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
424
- 2025-03-03 15:56:35,917 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
425
- 2025-03-03 15:56:36,143 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
426
- 2025-03-03 15:56:36,144 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
427
- 2025-03-03 15:56:39,869 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
428
- 2025-03-03 15:56:40,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
429
- 2025-03-03 15:56:40,387 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
430
- 2025-03-03 15:56:40,608 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
431
- 2025-03-03 15:56:40,828 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
432
- 2025-03-03 15:56:40,829 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=two
433
- 2025-03-03 15:56:44,221 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
434
- 2025-03-03 15:56:44,522 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
435
- 2025-03-03 15:56:44,728 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r2_c0.jpg
436
- 2025-03-03 15:56:44,929 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r3_c0.jpg
437
- 2025-03-03 15:56:45,153 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r4_c0.jpg
438
- 2025-03-03 15:56:45,372 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r5_c0.jpg
439
- 2025-03-03 15:56:45,372 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
440
- 2025-03-03 15:56:48,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
441
- 2025-03-03 15:56:48,806 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
442
- 2025-03-03 15:56:49,036 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
443
- 2025-03-03 15:56:49,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
444
- 2025-03-03 15:56:49,282 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
445
- 2025-03-03 15:56:52,374 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
446
- 2025-03-03 15:56:52,664 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
447
- 2025-03-03 15:56:52,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
448
- 2025-03-03 15:56:53,103 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
449
- 2025-03-03 15:56:53,329 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
450
- 2025-03-03 15:56:53,543 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
451
- 2025-03-03 15:56:53,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
452
- 2025-03-03 15:56:53,978 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
453
- 2025-03-03 15:56:53,979 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
454
- 2025-03-03 15:56:57,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
455
- 2025-03-03 15:56:57,690 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
456
- 2025-03-03 15:56:57,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
457
- 2025-03-03 15:56:58,126 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r3_c0.jpg
458
- 2025-03-03 15:56:58,131 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
459
- 2025-03-03 15:56:58,438 [INFO] __main__ - GPU memory cleaned up.
460
- 2025-03-03 15:56:58,445 [INFO] __main__ - Processing completed successfully.
461
- 2025-03-03 17:28:40,888 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
462
- 2025-03-03 17:28:41,627 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
463
- 2025-03-03 17:28:41,628 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
464
- 2025-03-03 17:28:41,960 [INFO] __main__ - Computed global offset: 4
465
- 2025-03-03 17:28:41,961 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
466
- 2025-03-03 17:29:47,681 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
467
- 2025-03-03 17:29:50,244 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
468
- 2025-03-03 17:29:50,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
469
- 2025-03-03 17:29:51,556 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
470
- 2025-03-03 17:29:52,183 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
471
- 2025-03-03 17:29:52,887 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
472
- 2025-03-03 17:29:53,485 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
473
- 2025-03-03 17:29:54,194 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
474
- 2025-03-03 17:29:54,820 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
475
- 2025-03-03 17:29:55,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
476
- 2025-03-03 17:29:56,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
477
- 2025-03-03 17:29:56,666 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
478
- 2025-03-03 17:29:57,238 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
479
- 2025-03-03 17:29:57,934 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
480
- 2025-03-03 17:29:58,524 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
481
- 2025-03-03 17:29:59,210 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
482
- 2025-03-03 17:29:59,902 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
483
- 2025-03-03 17:30:00,309 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
484
- 2025-03-03 17:30:01,021 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
485
- 2025-03-03 17:30:01,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
486
- 2025-03-03 17:30:02,389 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
487
- 2025-03-03 17:30:03,066 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
488
- 2025-03-03 17:30:03,630 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
489
- 2025-03-03 17:30:04,225 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
490
- 2025-03-03 17:30:04,890 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
491
- 2025-03-03 17:30:05,488 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
492
- 2025-03-03 17:30:06,047 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
493
- 2025-03-03 17:30:06,794 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
494
- 2025-03-03 17:30:07,237 [INFO] __main__ - Classifying images to detect tables.
495
- 2025-03-03 17:30:11,295 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
496
- 2025-03-03 17:30:15,135 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c0.jpg
497
- 2025-03-03 17:30:15,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r0_c1.jpg
498
- 2025-03-03 17:30:15,662 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c0.jpg
499
- 2025-03-03 17:30:15,897 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_1.jpg_r1_c1.jpg
500
- 2025-03-03 17:30:15,898 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
501
- 2025-03-03 17:30:20,773 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c0.jpg
502
- 2025-03-03 17:30:21,085 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r0_c1.jpg
503
- 2025-03-03 17:30:21,321 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c0.jpg
504
- 2025-03-03 17:30:21,556 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r1_c1.jpg
505
- 2025-03-03 17:30:21,799 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r2_c0.jpg
506
- 2025-03-03 17:30:22,035 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_2.jpg_r3_c0.jpg
507
- 2025-03-03 17:30:22,036 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
508
- 2025-03-03 17:30:27,289 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c0.jpg
509
- 2025-03-03 17:30:27,603 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_3.jpg_r0_c1.jpg
510
- 2025-03-03 17:30:27,603 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
511
- 2025-03-03 17:30:33,266 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c0.jpg
512
- 2025-03-03 17:30:33,573 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r0_c1.jpg
513
- 2025-03-03 17:30:33,831 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c0.jpg
514
- 2025-03-03 17:30:34,027 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_4.jpg_r1_c1.jpg
515
- 2025-03-03 17:30:34,028 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
516
- 2025-03-03 17:30:39,478 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c0.jpg
517
- 2025-03-03 17:30:39,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r0_c1.jpg
518
- 2025-03-03 17:30:39,984 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c0.jpg
519
- 2025-03-03 17:30:40,240 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r1_c1.jpg
520
- 2025-03-03 17:30:40,466 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_5.jpg_r2_c0.jpg
521
- 2025-03-03 17:30:40,467 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
522
- 2025-03-03 17:30:44,908 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c0.jpg
523
- 2025-03-03 17:30:45,224 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r0_c1.jpg
524
- 2025-03-03 17:30:45,474 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c0.jpg
525
- 2025-03-03 17:30:45,669 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r1_c1.jpg
526
- 2025-03-03 17:30:45,909 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_6.jpg_r2_c0.jpg
527
- 2025-03-03 17:30:45,910 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
528
- 2025-03-03 17:30:50,049 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c0.jpg
529
- 2025-03-03 17:30:50,338 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r0_c1.jpg
530
- 2025-03-03 17:30:50,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c0.jpg
531
- 2025-03-03 17:30:50,772 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r1_c1.jpg
532
- 2025-03-03 17:30:51,001 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_7.jpg_r2_c0.jpg
533
- 2025-03-03 17:30:51,001 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
534
- 2025-03-03 17:30:54,784 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c0.jpg
535
- 2025-03-03 17:30:55,093 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r0_c1.jpg
536
- 2025-03-03 17:30:55,328 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c0.jpg
537
- 2025-03-03 17:30:55,552 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r1_c1.jpg
538
- 2025-03-03 17:30:55,777 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r2_c0.jpg
539
- 2025-03-03 17:30:56,026 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r3_c0.jpg
540
- 2025-03-03 17:30:56,240 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_8.jpg_r4_c0.jpg
541
- 2025-03-03 17:30:56,240 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
542
- 2025-03-03 17:31:00,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r0_c0.jpg
543
- 2025-03-03 17:31:00,759 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_9.jpg_r1_c0.jpg
544
- 2025-03-03 17:31:00,760 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
545
- 2025-03-03 17:31:04,717 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c0.jpg
546
- 2025-03-03 17:31:04,985 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r0_c1.jpg
547
- 2025-03-03 17:31:05,239 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r1_c0.jpg
548
- 2025-03-03 17:31:05,455 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r2_c0.jpg
549
- 2025-03-03 17:31:05,683 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_10.jpg_r3_c0.jpg
550
- 2025-03-03 17:31:05,684 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
551
- 2025-03-03 17:31:10,692 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r0_c0.jpg
552
- 2025-03-03 17:31:11,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r1_c0.jpg
553
- 2025-03-03 17:31:11,245 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r2_c0.jpg
554
- 2025-03-03 17:31:11,435 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r3_c0.jpg
555
- 2025-03-03 17:31:11,655 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_11.jpg_r4_c0.jpg
556
- 2025-03-03 17:31:11,655 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
557
- 2025-03-03 17:31:15,894 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c0.jpg
558
- 2025-03-03 17:31:16,213 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r0_c1.jpg
559
- 2025-03-03 17:31:16,433 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c0.jpg
560
- 2025-03-03 17:31:16,670 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r1_c1.jpg
561
- 2025-03-03 17:31:16,928 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c0.jpg
562
- 2025-03-03 17:31:17,120 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_12.jpg_r2_c1.jpg
563
- 2025-03-03 17:31:17,120 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
564
- 2025-03-03 17:31:20,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c0.jpg
565
- 2025-03-03 17:31:21,154 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r0_c1.jpg
566
- 2025-03-03 17:31:21,398 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c0.jpg
567
- 2025-03-03 17:31:21,637 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r1_c1.jpg
568
- 2025-03-03 17:31:21,856 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r2_c0.jpg
569
- 2025-03-03 17:31:22,094 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_13.jpg_r3_c0.jpg
570
- 2025-03-03 17:31:22,095 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=two
571
- 2025-03-03 17:31:27,406 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r0_c0.jpg
572
- 2025-03-03 17:31:27,685 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_14.jpg_r1_c0.jpg
573
- 2025-03-03 17:31:27,686 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
574
- 2025-03-03 17:31:32,916 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c0.jpg
575
- 2025-03-03 17:31:33,211 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r0_c1.jpg
576
- 2025-03-03 17:31:33,422 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r1_c0.jpg
577
- 2025-03-03 17:31:33,672 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r2_c0.jpg
578
- 2025-03-03 17:31:33,904 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_15.jpg_r3_c0.jpg
579
- 2025-03-03 17:31:33,904 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
580
- 2025-03-03 17:31:39,209 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c0.jpg
581
- 2025-03-03 17:31:39,525 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r0_c1.jpg
582
- 2025-03-03 17:31:39,778 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r1_c0.jpg
583
- 2025-03-03 17:31:40,003 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c0.jpg
584
- 2025-03-03 17:31:40,232 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r2_c1.jpg
585
- 2025-03-03 17:31:40,479 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r3_c0.jpg
586
- 2025-03-03 17:31:40,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_16.jpg_r4_c0.jpg
587
- 2025-03-03 17:31:40,708 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
588
- 2025-03-03 17:31:45,922 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c0.jpg
589
- 2025-03-03 17:31:46,235 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r0_c1.jpg
590
- 2025-03-03 17:31:46,463 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r1_c0.jpg
591
- 2025-03-03 17:31:46,691 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c0.jpg
592
- 2025-03-03 17:31:46,878 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r2_c1.jpg
593
- 2025-03-03 17:31:47,130 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r3_c0.jpg
594
- 2025-03-03 17:31:47,375 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_17.jpg_r4_c0.jpg
595
- 2025-03-03 17:31:47,376 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
596
- 2025-03-03 17:31:49,248 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c0.jpg
597
- 2025-03-03 17:31:49,508 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r0_c1.jpg
598
- 2025-03-03 17:31:49,702 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c0.jpg
599
- 2025-03-03 17:31:49,890 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_18.jpg_r1_c1.jpg
600
- 2025-03-03 17:31:49,891 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
601
- 2025-03-03 17:31:53,834 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c0.jpg
602
- 2025-03-03 17:31:54,137 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r0_c1.jpg
603
- 2025-03-03 17:31:54,379 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c0.jpg
604
- 2025-03-03 17:31:54,577 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r1_c1.jpg
605
- 2025-03-03 17:31:54,793 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c0.jpg
606
- 2025-03-03 17:31:55,019 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_19.jpg_r2_c1.jpg
607
- 2025-03-03 17:31:55,019 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=two
608
- 2025-03-03 17:32:00,652 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_20.jpg_r0_c0.jpg
609
- 2025-03-03 17:32:00,653 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
610
- 2025-03-03 17:32:05,661 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c0.jpg
611
- 2025-03-03 17:32:05,960 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r0_c1.jpg
612
- 2025-03-03 17:32:06,196 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c0.jpg
613
- 2025-03-03 17:32:06,457 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r1_c1.jpg
614
- 2025-03-03 17:32:06,707 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r2_c0.jpg
615
- 2025-03-03 17:32:06,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_21.jpg_r2_c1.jpg
616
- 2025-03-03 17:32:06,941 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
617
- 2025-03-03 17:32:12,376 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c0.jpg
618
- 2025-03-03 17:32:12,703 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r0_c1.jpg
619
- 2025-03-03 17:32:12,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_22.jpg_r1_c0.jpg
620
- 2025-03-03 17:32:12,941 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
621
- 2025-03-03 17:32:17,156 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c0.jpg
622
- 2025-03-03 17:32:17,423 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r0_c1.jpg
623
- 2025-03-03 17:32:17,698 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c0.jpg
624
- 2025-03-03 17:32:17,937 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_23.jpg_r1_c1.jpg
625
- 2025-03-03 17:32:17,938 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
626
- 2025-03-03 17:32:23,150 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c0.jpg
627
- 2025-03-03 17:32:23,450 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r0_c1.jpg
628
- 2025-03-03 17:32:23,675 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c0.jpg
629
- 2025-03-03 17:32:23,918 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r1_c1.jpg
630
- 2025-03-03 17:32:24,135 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_24.jpg_r2_c0.jpg
631
- 2025-03-03 17:32:24,136 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
632
- 2025-03-03 17:32:29,269 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c0.jpg
633
- 2025-03-03 17:32:29,548 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r0_c1.jpg
634
- 2025-03-03 17:32:29,771 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c0.jpg
635
- 2025-03-03 17:32:30,016 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_25.jpg_r1_c1.jpg
636
- 2025-03-03 17:32:30,016 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
637
- 2025-03-03 17:32:34,291 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c0.jpg
638
- 2025-03-03 17:32:34,576 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r0_c1.jpg
639
- 2025-03-03 17:32:34,811 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c0.jpg
640
- 2025-03-03 17:32:35,083 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r1_c1.jpg
641
- 2025-03-03 17:32:35,298 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_26.jpg_r2_c0.jpg
642
- 2025-03-03 17:32:35,299 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
643
- 2025-03-03 17:32:39,414 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c0.jpg
644
- 2025-03-03 17:32:39,710 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r0_c1.jpg
645
- 2025-03-03 17:32:39,965 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c0.jpg
646
- 2025-03-03 17:32:40,181 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r1_c1.jpg
647
- 2025-03-03 17:32:40,393 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r2_c0.jpg
648
- 2025-03-03 17:32:40,629 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r3_c0.jpg
649
- 2025-03-03 17:32:40,852 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c0.jpg
650
- 2025-03-03 17:32:41,080 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_27.jpg_r4_c1.jpg
651
- 2025-03-03 17:32:41,080 [INFO] __main__ - Processing table image: /topic-extraction/img_28.jpg, columns=two
652
- 2025-03-03 17:32:45,688 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r0_c0.jpg
653
- 2025-03-03 17:32:45,999 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r1_c0.jpg
654
- 2025-03-03 17:32:46,226 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c0.jpg
655
- 2025-03-03 17:32:46,462 [INFO] __main__ - Uploaded to S3: /topic-extraction/cells/table_s3_img_28.jpg_r2_c1.jpg
656
- 2025-03-03 17:32:46,468 [INFO] __main__ - Final subtopics JSON saved locally at /home/user/app/we/we_ars/final_subtopics.json
657
- 2025-03-03 17:32:46,930 [INFO] __main__ - GPU memory cleaned up.
658
- 2025-03-03 17:32:46,940 [INFO] __main__ - Processing completed successfully.
659
- 2025-03-03 17:42:37,923 [INFO] __main__ - Processing PDF: /home/user/app/input_output/a-level-pearson-mathematics-specification.pdf
660
- 2025-03-03 17:42:38,720 [INFO] __main__ - Gemini returned subtopics: {'Paper 1 and Paper 2: Pure Mathematics': [11, 29], 'Paper 3: Statistics and Mechanics': [30, 40]}
661
- 2025-03-03 17:42:38,721 [INFO] __main__ - Loaded 1135473 bytes from local file '/home/user/app/input_output/a-level-pearson-mathematics-specification.pdf'
662
- 2025-03-03 17:42:39,089 [INFO] __main__ - Computed global offset: 4
663
- 2025-03-03 17:42:39,090 [INFO] __main__ - Processing pages (0-based): [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
664
- 2025-03-03 17:43:33,813 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_1.jpg
665
- 2025-03-03 17:43:35,535 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_2.jpg
666
- 2025-03-03 17:43:36,124 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_3.jpg
667
- 2025-03-03 17:43:36,748 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_4.jpg
668
- 2025-03-03 17:43:37,282 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_5.jpg
669
- 2025-03-03 17:43:37,857 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_6.jpg
670
- 2025-03-03 17:43:38,322 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_7.jpg
671
- 2025-03-03 17:43:38,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_8.jpg
672
- 2025-03-03 17:43:39,279 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_9.jpg
673
- 2025-03-03 17:43:39,847 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_10.jpg
674
- 2025-03-03 17:43:40,400 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_11.jpg
675
- 2025-03-03 17:43:40,940 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_12.jpg
676
- 2025-03-03 17:43:41,381 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_13.jpg
677
- 2025-03-03 17:43:41,964 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_14.jpg
678
- 2025-03-03 17:43:42,436 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_15.jpg
679
- 2025-03-03 17:43:42,967 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_16.jpg
680
- 2025-03-03 17:43:43,518 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_17.jpg
681
- 2025-03-03 17:43:43,822 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_18.jpg
682
- 2025-03-03 17:43:44,428 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_19.jpg
683
- 2025-03-03 17:43:44,963 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_20.jpg
684
- 2025-03-03 17:43:45,639 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_21.jpg
685
- 2025-03-03 17:43:46,199 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_22.jpg
686
- 2025-03-03 17:43:46,786 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_23.jpg
687
- 2025-03-03 17:43:47,259 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_24.jpg
688
- 2025-03-03 17:43:47,796 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_25.jpg
689
- 2025-03-03 17:43:48,235 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_26.jpg
690
- 2025-03-03 17:43:48,656 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_27.jpg
691
- 2025-03-03 17:43:49,290 [INFO] __main__ - Uploaded to S3: /topic-extraction/img_28.jpg
692
- 2025-03-03 17:43:49,683 [INFO] __main__ - Classifying images to detect tables.
693
- 2025-03-03 17:43:53,784 [INFO] __main__ - Processing table image: /topic-extraction/img_1.jpg, columns=three
694
- 2025-03-03 17:43:56,550 [ERROR] __main__ - Error processing table image /topic-extraction/img_1.jpg: [Errno 2] No such file or directory: '/tmp/tmp63t8um4x.jpg_rows/row_0/col_0.jpg'
695
- 2025-03-03 17:43:56,550 [INFO] __main__ - Processing table image: /topic-extraction/img_2.jpg, columns=three
696
- 2025-03-03 17:43:59,443 [ERROR] __main__ - Error processing table image /topic-extraction/img_2.jpg: [Errno 2] No such file or directory: '/tmp/tmps0rsmzl6.jpg_rows/row_0/col_0.jpg'
697
- 2025-03-03 17:43:59,443 [INFO] __main__ - Processing table image: /topic-extraction/img_3.jpg, columns=three
698
- 2025-03-03 17:44:02,428 [ERROR] __main__ - Error processing table image /topic-extraction/img_3.jpg: [Errno 2] No such file or directory: '/tmp/tmpj4fx8a9s.jpg_rows/row_0/col_0.jpg'
699
- 2025-03-03 17:44:02,429 [INFO] __main__ - Processing table image: /topic-extraction/img_4.jpg, columns=three
700
- 2025-03-03 17:44:05,216 [ERROR] __main__ - Error processing table image /topic-extraction/img_4.jpg: [Errno 2] No such file or directory: '/tmp/tmpmumoju32.jpg_rows/row_0/col_0.jpg'
701
- 2025-03-03 17:44:05,216 [INFO] __main__ - Processing table image: /topic-extraction/img_5.jpg, columns=three
702
- 2025-03-03 17:44:08,445 [ERROR] __main__ - Error processing table image /topic-extraction/img_5.jpg: [Errno 2] No such file or directory: '/tmp/tmptekcelbx.jpg_rows/row_0/col_0.jpg'
703
- 2025-03-03 17:44:08,445 [INFO] __main__ - Processing table image: /topic-extraction/img_6.jpg, columns=three
704
- 2025-03-03 17:44:11,635 [ERROR] __main__ - Error processing table image /topic-extraction/img_6.jpg: [Errno 2] No such file or directory: '/tmp/tmpi4bsuwn6.jpg_rows/row_0/col_0.jpg'
705
- 2025-03-03 17:44:11,635 [INFO] __main__ - Processing table image: /topic-extraction/img_7.jpg, columns=three
706
- 2025-03-03 17:44:14,589 [ERROR] __main__ - Error processing table image /topic-extraction/img_7.jpg: [Errno 2] No such file or directory: '/tmp/tmpj_8l15kk.jpg_rows/row_0/col_0.jpg'
707
- 2025-03-03 17:44:14,589 [INFO] __main__ - Processing table image: /topic-extraction/img_8.jpg, columns=three
708
- 2025-03-03 17:44:17,836 [ERROR] __main__ - Error processing table image /topic-extraction/img_8.jpg: [Errno 2] No such file or directory: '/tmp/tmp3_kflaqs.jpg_rows/row_0/col_0.jpg'
709
- 2025-03-03 17:44:17,837 [INFO] __main__ - Processing table image: /topic-extraction/img_9.jpg, columns=three
710
- 2025-03-03 17:44:21,255 [ERROR] __main__ - Error processing table image /topic-extraction/img_9.jpg: [Errno 2] No such file or directory: '/tmp/tmpwuir45y0.jpg_rows/row_0/col_0.jpg'
711
- 2025-03-03 17:44:21,255 [INFO] __main__ - Processing table image: /topic-extraction/img_10.jpg, columns=three
712
- 2025-03-03 17:44:24,155 [ERROR] __main__ - Error processing table image /topic-extraction/img_10.jpg: [Errno 2] No such file or directory: '/tmp/tmpu2qia4ih.jpg_rows/row_0/col_0.jpg'
713
- 2025-03-03 17:44:24,155 [INFO] __main__ - Processing table image: /topic-extraction/img_11.jpg, columns=three
714
- 2025-03-03 17:44:27,346 [ERROR] __main__ - Error processing table image /topic-extraction/img_11.jpg: [Errno 2] No such file or directory: '/tmp/tmp5ucu_tbp.jpg_rows/row_0/col_0.jpg'
715
- 2025-03-03 17:44:27,346 [INFO] __main__ - Processing table image: /topic-extraction/img_12.jpg, columns=three
716
- 2025-03-03 17:44:30,489 [ERROR] __main__ - Error processing table image /topic-extraction/img_12.jpg: [Errno 2] No such file or directory: '/tmp/tmp_ciyju4y.jpg_rows/row_0/col_0.jpg'
717
- 2025-03-03 17:44:30,489 [INFO] __main__ - Processing table image: /topic-extraction/img_13.jpg, columns=three
718
- 2025-03-03 17:44:33,140 [ERROR] __main__ - Error processing table image /topic-extraction/img_13.jpg: [Errno 2] No such file or directory: '/tmp/tmp1_mz16x9.jpg_rows/row_0/col_0.jpg'
719
- 2025-03-03 17:44:33,141 [INFO] __main__ - Processing table image: /topic-extraction/img_14.jpg, columns=two
720
- 2025-03-03 17:44:36,423 [ERROR] __main__ - Error processing table image /topic-extraction/img_14.jpg: [Errno 2] No such file or directory: '/tmp/tmp_44dh1m3.jpg_rows/row_0/col_0.jpg'
721
- 2025-03-03 17:44:36,423 [INFO] __main__ - Processing table image: /topic-extraction/img_15.jpg, columns=three
722
- 2025-03-03 17:44:39,622 [ERROR] __main__ - Error processing table image /topic-extraction/img_15.jpg: [Errno 2] No such file or directory: '/tmp/tmp4e3y3440.jpg_rows/row_0/col_0.jpg'
723
- 2025-03-03 17:44:39,623 [INFO] __main__ - Processing table image: /topic-extraction/img_16.jpg, columns=three
724
- 2025-03-03 17:44:42,896 [ERROR] __main__ - Error processing table image /topic-extraction/img_16.jpg: [Errno 2] No such file or directory: '/tmp/tmp2njdfsc6.jpg_rows/row_0/col_0.jpg'
725
- 2025-03-03 17:44:42,896 [INFO] __main__ - Processing table image: /topic-extraction/img_17.jpg, columns=three
726
- 2025-03-03 17:44:46,043 [ERROR] __main__ - Error processing table image /topic-extraction/img_17.jpg: [Errno 2] No such file or directory: '/tmp/tmpwq0nk28o.jpg_rows/row_0/col_0.jpg'
727
- 2025-03-03 17:44:46,044 [INFO] __main__ - Processing table image: /topic-extraction/img_18.jpg, columns=three
728
- 2025-03-03 17:44:47,088 [ERROR] __main__ - Error processing table image /topic-extraction/img_18.jpg: [Errno 2] No such file or directory: '/tmp/tmpdx8gcoqg.jpg_rows/row_0/col_0.jpg'
729
- 2025-03-03 17:44:47,089 [INFO] __main__ - Processing table image: /topic-extraction/img_19.jpg, columns=three
730
- 2025-03-03 17:44:49,477 [ERROR] __main__ - Error processing table image /topic-extraction/img_19.jpg: [Errno 2] No such file or directory: '/tmp/tmp72627l8g.jpg_rows/row_0/col_0.jpg'
731
- 2025-03-03 17:44:49,478 [INFO] __main__ - Processing table image: /topic-extraction/img_20.jpg, columns=two
732
- 2025-03-03 17:44:53,018 [ERROR] __main__ - Error processing table image /topic-extraction/img_20.jpg: [Errno 2] No such file or directory: '/tmp/tmpdnic1_0w.jpg_rows/row_0/col_0.jpg'
733
- 2025-03-03 17:44:53,019 [INFO] __main__ - Processing table image: /topic-extraction/img_21.jpg, columns=three
734
- 2025-03-03 17:44:56,093 [ERROR] __main__ - Error processing table image /topic-extraction/img_21.jpg: [Errno 2] No such file or directory: '/tmp/tmpmhoh8yuy.jpg_rows/row_0/col_0.jpg'
735
- 2025-03-03 17:44:56,093 [INFO] __main__ - Processing table image: /topic-extraction/img_22.jpg, columns=three
736
- 2025-03-03 17:44:59,613 [ERROR] __main__ - Error processing table image /topic-extraction/img_22.jpg: [Errno 2] No such file or directory: '/tmp/tmp7ted27c7.jpg_rows/row_0/col_0.jpg'
737
- 2025-03-03 17:44:59,613 [INFO] __main__ - Processing table image: /topic-extraction/img_23.jpg, columns=three
738
- 2025-03-03 17:45:02,646 [ERROR] __main__ - Error processing table image /topic-extraction/img_23.jpg: [Errno 2] No such file or directory: '/tmp/tmpbr3_k9_v.jpg_rows/row_0/col_0.jpg'
739
- 2025-03-03 17:45:02,646 [INFO] __main__ - Processing table image: /topic-extraction/img_24.jpg, columns=three
740
- 2025-03-03 17:45:06,144 [ERROR] __main__ - Error processing table image /topic-extraction/img_24.jpg: [Errno 2] No such file or directory: '/tmp/tmpg6iw11r9.jpg_rows/row_0/col_0.jpg'
741
- 2025-03-03 17:45:06,145 [INFO] __main__ - Processing table image: /topic-extraction/img_25.jpg, columns=three
742
- 2025-03-03 17:45:09,409 [ERROR] __main__ - Error processing table image /topic-extraction/img_25.jpg: [Errno 2] No such file or directory: '/tmp/tmp_ntakmkl.jpg_rows/row_0/col_0.jpg'
743
- 2025-03-03 17:45:09,410 [INFO] __main__ - Processing table image: /topic-extraction/img_26.jpg, columns=three
744
- 2025-03-03 17:45:12,057 [ERROR] __main__ - Error processing table image /topic-extraction/img_26.jpg: [Errno 2] No such file or directory: '/tmp/tmp0k8i_n4p.jpg_rows/row_0/col_0.jpg'
745
- 2025-03-03 17:45:12,057 [INFO] __main__ - Processing table image: /topic-extraction/img_27.jpg, columns=three
746
- 2025-03-03 17:45:14,839 [INFO] __main__ - GPU memory cleaned up.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
we/final_subtopics.json DELETED
@@ -1,1139 +0,0 @@
1
- [
2
- {
3
- "title": "Topics",
4
- "contents": [
5
- {
6
- "type": "image",
7
- "key": "/topic-extraction/cells/img_1.jpg_r0_c0.png"
8
- }
9
- ],
10
- "children": [
11
- {
12
- "title": "1.1",
13
- "contents": [
14
- {
15
- "type": "image",
16
- "key": "/topic-extraction/cells/img_1.jpg_r1_c1.png"
17
- }
18
- ],
19
- "children": []
20
- }
21
- ]
22
- },
23
- {
24
- "title": "2 Algebra and functions",
25
- "contents": [
26
- {
27
- "type": "image",
28
- "key": "/topic-extraction/cells/img_2.jpg_r0_c0.png"
29
- }
30
- ],
31
- "children": [
32
- {
33
- "title": "2.1",
34
- "contents": [
35
- {
36
- "type": "image",
37
- "key": "/topic-extraction/cells/img_2.jpg_r0_c1.png"
38
- }
39
- ],
40
- "children": []
41
- },
42
- {
43
- "title": "2.2",
44
- "contents": [
45
- {
46
- "type": "image",
47
- "key": "/topic-extraction/cells/img_2.jpg_r1_c0.png"
48
- }
49
- ],
50
- "children": []
51
- },
52
- {
53
- "title": "2.3",
54
- "contents": [
55
- {
56
- "type": "image",
57
- "key": "/topic-extraction/cells/img_2.jpg_r2_c0.png"
58
- }
59
- ],
60
- "children": []
61
- },
62
- {
63
- "title": "2.4",
64
- "contents": [
65
- {
66
- "type": "image",
67
- "key": "/topic-extraction/cells/img_2.jpg_r3_c0.png"
68
- }
69
- ],
70
- "children": []
71
- }
72
- ]
73
- },
74
- {
75
- "title": "2 Algebra and functions continued",
76
- "contents": [
77
- {
78
- "type": "image",
79
- "key": "/topic-extraction/cells/img_3.jpg_r0_c0.png"
80
- }
81
- ],
82
- "children": [
83
- {
84
- "title": "2.5",
85
- "contents": [
86
- {
87
- "type": "image",
88
- "key": "/topic-extraction/cells/img_3.jpg_r0_c1.png"
89
- }
90
- ],
91
- "children": []
92
- },
93
- {
94
- "title": "2.6",
95
- "contents": [
96
- {
97
- "type": "image",
98
- "key": "/topic-extraction/cells/img_3.jpg_r1_c0.png"
99
- }
100
- ],
101
- "children": []
102
- }
103
- ]
104
- },
105
- {
106
- "title": "Topics",
107
- "contents": [
108
- {
109
- "type": "image",
110
- "key": "/topic-extraction/cells/img_4.jpg_r0_c0.png"
111
- }
112
- ],
113
- "children": [
114
- {
115
- "title": "2.7",
116
- "contents": [
117
- {
118
- "type": "image",
119
- "key": "/topic-extraction/cells/img_4.jpg_r1_c1.png"
120
- }
121
- ],
122
- "children": []
123
- }
124
- ]
125
- },
126
- {
127
- "title": "Topics",
128
- "contents": [
129
- {
130
- "type": "image",
131
- "key": "/topic-extraction/cells/img_5.jpg_r0_c0.png"
132
- }
133
- ],
134
- "children": [
135
- {
136
- "title": "2.8",
137
- "contents": [
138
- {
139
- "type": "image",
140
- "key": "/topic-extraction/cells/img_5.jpg_r1_c1.png"
141
- }
142
- ],
143
- "children": []
144
- },
145
- {
146
- "title": "2.9",
147
- "contents": [
148
- {
149
- "type": "image",
150
- "key": "/topic-extraction/cells/img_5.jpg_r2_c0.png"
151
- }
152
- ],
153
- "children": []
154
- }
155
- ]
156
- },
157
- {
158
- "title": "2 Algebra and functions continued",
159
- "contents": [
160
- {
161
- "type": "image",
162
- "key": "/topic-extraction/cells/img_6.jpg_r0_c0.png"
163
- }
164
- ],
165
- "children": [
166
- {
167
- "title": "2.11",
168
- "contents": [
169
- {
170
- "type": "image",
171
- "key": "/topic-extraction/cells/img_6.jpg_r0_c1.png"
172
- }
173
- ],
174
- "children": []
175
- },
176
- {
177
- "title": "3.1",
178
- "contents": [
179
- {
180
- "type": "image",
181
- "key": "/topic-extraction/cells/img_6.jpg_r1_c1.png"
182
- }
183
- ],
184
- "children": []
185
- }
186
- ]
187
- },
188
- {
189
- "title": "3 Coordinate geometry in the (x, y) plane continued",
190
- "contents": [
191
- {
192
- "type": "image",
193
- "key": "/topic-extraction/cells/img_7.jpg_r0_c0.png"
194
- }
195
- ],
196
- "children": [
197
- {
198
- "title": "3.3",
199
- "contents": [
200
- {
201
- "type": "image",
202
- "key": "/topic-extraction/cells/img_7.jpg_r0_c1.png"
203
- }
204
- ],
205
- "children": []
206
- },
207
- {
208
- "title": "3.4",
209
- "contents": [
210
- {
211
- "type": "image",
212
- "key": "/topic-extraction/cells/img_7.jpg_r1_c0.png"
213
- }
214
- ],
215
- "children": []
216
- },
217
- {
218
- "title": "4.1",
219
- "contents": [
220
- {
221
- "type": "image",
222
- "key": "/topic-extraction/cells/img_7.jpg_r2_c1.png"
223
- }
224
- ],
225
- "children": []
226
- }
227
- ]
228
- },
229
- {
230
- "title": "Topics",
231
- "contents": [
232
- {
233
- "type": "image",
234
- "key": "/topic-extraction/cells/img_8.jpg_r0_c0.png"
235
- }
236
- ],
237
- "children": [
238
- {
239
- "title": "4.2",
240
- "contents": [
241
- {
242
- "type": "image",
243
- "key": "/topic-extraction/cells/img_8.jpg_r1_c1.png"
244
- }
245
- ],
246
- "children": []
247
- },
248
- {
249
- "title": "4.3",
250
- "contents": [
251
- {
252
- "type": "image",
253
- "key": "/topic-extraction/cells/img_8.jpg_r2_c0.png"
254
- }
255
- ],
256
- "children": []
257
- },
258
- {
259
- "title": "4.4",
260
- "contents": [
261
- {
262
- "type": "image",
263
- "key": "/topic-extraction/cells/img_8.jpg_r3_c0.png"
264
- }
265
- ],
266
- "children": []
267
- },
268
- {
269
- "title": "4.5",
270
- "contents": [
271
- {
272
- "type": "image",
273
- "key": "/topic-extraction/cells/img_8.jpg_r4_c0.png"
274
- }
275
- ],
276
- "children": []
277
- },
278
- {
279
- "title": "4.6",
280
- "contents": [
281
- {
282
- "type": "image",
283
- "key": "/topic-extraction/cells/img_8.jpg_r5_c0.png"
284
- }
285
- ],
286
- "children": []
287
- }
288
- ]
289
- },
290
- {
291
- "title": "gonometry",
292
- "contents": [
293
- {
294
- "type": "image",
295
- "key": "/topic-extraction/cells/img_9.jpg_r0_c0.png"
296
- }
297
- ],
298
- "children": [
299
- {
300
- "title": "5.1",
301
- "contents": [
302
- {
303
- "type": "image",
304
- "key": "/topic-extraction/cells/img_9.jpg_r0_c1.png"
305
- }
306
- ],
307
- "children": []
308
- },
309
- {
310
- "title": "5.2",
311
- "contents": [
312
- {
313
- "type": "image",
314
- "key": "/topic-extraction/cells/img_9.jpg_r1_c0.png"
315
- }
316
- ],
317
- "children": []
318
- },
319
- {
320
- "title": "5.3",
321
- "contents": [
322
- {
323
- "type": "image",
324
- "key": "/topic-extraction/cells/img_9.jpg_r2_c0.png"
325
- }
326
- ],
327
- "children": []
328
- },
329
- {
330
- "title": "5.4",
331
- "contents": [
332
- {
333
- "type": "image",
334
- "key": "/topic-extraction/cells/img_9.jpg_r3_c0.png"
335
- }
336
- ],
337
- "children": []
338
- }
339
- ]
340
- },
341
- {
342
- "title": "5 Trigonometry continued",
343
- "contents": [
344
- {
345
- "type": "image",
346
- "key": "/topic-extraction/cells/img_10.jpg_r0_c0.png"
347
- }
348
- ],
349
- "children": [
350
- {
351
- "title": "5.5",
352
- "contents": [
353
- {
354
- "type": "image",
355
- "key": "/topic-extraction/cells/img_10.jpg_r0_c1.png"
356
- }
357
- ],
358
- "children": []
359
- },
360
- {
361
- "title": "5.6",
362
- "contents": [
363
- {
364
- "type": "image",
365
- "key": "/topic-extraction/cells/img_10.jpg_r1_c0.png"
366
- }
367
- ],
368
- "children": []
369
- },
370
- {
371
- "title": "5.7",
372
- "contents": [
373
- {
374
- "type": "image",
375
- "key": "/topic-extraction/cells/img_10.jpg_r2_c0.png"
376
- }
377
- ],
378
- "children": []
379
- },
380
- {
381
- "title": "5.8",
382
- "contents": [
383
- {
384
- "type": "image",
385
- "key": "/topic-extraction/cells/img_10.jpg_r3_c0.png"
386
- }
387
- ],
388
- "children": []
389
- }
390
- ]
391
- },
392
- {
393
- "title": "",
394
- "contents": [],
395
- "children": [
396
- {
397
- "title": "6.1",
398
- "contents": [
399
- {
400
- "type": "image",
401
- "key": "/topic-extraction/cells/img_11.jpg_r0_c0.png"
402
- }
403
- ],
404
- "children": []
405
- },
406
- {
407
- "title": "6.2",
408
- "contents": [
409
- {
410
- "type": "image",
411
- "key": "/topic-extraction/cells/img_11.jpg_r1_c0.png"
412
- }
413
- ],
414
- "children": []
415
- },
416
- {
417
- "title": "6.3",
418
- "contents": [
419
- {
420
- "type": "image",
421
- "key": "/topic-extraction/cells/img_11.jpg_r2_c0.png"
422
- }
423
- ],
424
- "children": []
425
- },
426
- {
427
- "title": "6.4",
428
- "contents": [
429
- {
430
- "type": "image",
431
- "key": "/topic-extraction/cells/img_11.jpg_r3_c0.png"
432
- }
433
- ],
434
- "children": []
435
- },
436
- {
437
- "title": "6.5",
438
- "contents": [
439
- {
440
- "type": "image",
441
- "key": "/topic-extraction/cells/img_11.jpg_r4_c0.png"
442
- }
443
- ],
444
- "children": []
445
- },
446
- {
447
- "title": "6.6",
448
- "contents": [
449
- {
450
- "type": "image",
451
- "key": "/topic-extraction/cells/img_11.jpg_r5_c0.png"
452
- }
453
- ],
454
- "children": []
455
- }
456
- ]
457
- },
458
- {
459
- "title": "Topics",
460
- "contents": [
461
- {
462
- "type": "image",
463
- "key": "/topic-extraction/cells/img_12.jpg_r0_c0.png"
464
- }
465
- ],
466
- "children": [
467
- {
468
- "title": "6.7",
469
- "contents": [
470
- {
471
- "type": "image",
472
- "key": "/topic-extraction/cells/img_12.jpg_r1_c1.png"
473
- }
474
- ],
475
- "children": []
476
- },
477
- {
478
- "title": "7.1",
479
- "contents": [
480
- {
481
- "type": "image",
482
- "key": "/topic-extraction/cells/img_12.jpg_r2_c1.png"
483
- }
484
- ],
485
- "children": []
486
- }
487
- ]
488
- },
489
- {
490
- "title": "Topics",
491
- "contents": [
492
- {
493
- "type": "image",
494
- "key": "/topic-extraction/cells/img_13.jpg_r0_c0.png"
495
- }
496
- ],
497
- "children": [
498
- {
499
- "title": "7.1",
500
- "contents": [
501
- {
502
- "type": "image",
503
- "key": "/topic-extraction/cells/img_13.jpg_r1_c1.png"
504
- }
505
- ],
506
- "children": []
507
- },
508
- {
509
- "title": "7.2",
510
- "contents": [
511
- {
512
- "type": "image",
513
- "key": "/topic-extraction/cells/img_13.jpg_r2_c0.png"
514
- }
515
- ],
516
- "children": []
517
- },
518
- {
519
- "title": "7.3",
520
- "contents": [
521
- {
522
- "type": "image",
523
- "key": "/topic-extraction/cells/img_13.jpg_r3_c0.png"
524
- }
525
- ],
526
- "children": []
527
- }
528
- ]
529
- },
530
- {
531
- "title": "Topics",
532
- "contents": [
533
- {
534
- "type": "image",
535
- "key": "/topic-extraction/cells/img_14.jpg_r0_c0.png"
536
- }
537
- ],
538
- "children": [
539
- {
540
- "title": "7.4",
541
- "contents": [
542
- {
543
- "type": "image",
544
- "key": "/topic-extraction/cells/img_14.jpg_r1_c1.png"
545
- }
546
- ],
547
- "children": []
548
- },
549
- {
550
- "title": "7.5",
551
- "contents": [
552
- {
553
- "type": "image",
554
- "key": "/topic-extraction/cells/img_14.jpg_r2_c0.png"
555
- }
556
- ],
557
- "children": []
558
- },
559
- {
560
- "title": "7.6",
561
- "contents": [
562
- {
563
- "type": "image",
564
- "key": "/topic-extraction/cells/img_14.jpg_r3_c0.png"
565
- }
566
- ],
567
- "children": []
568
- },
569
- {
570
- "title": "8.1",
571
- "contents": [
572
- {
573
- "type": "image",
574
- "key": "/topic-extraction/cells/img_14.jpg_r4_c1.png"
575
- }
576
- ],
577
- "children": []
578
- },
579
- {
580
- "title": "8.2",
581
- "contents": [
582
- {
583
- "type": "image",
584
- "key": "/topic-extraction/cells/img_14.jpg_r5_c0.png"
585
- }
586
- ],
587
- "children": []
588
- }
589
- ]
590
- },
591
- {
592
- "title": "Topics",
593
- "contents": [
594
- {
595
- "type": "image",
596
- "key": "/topic-extraction/cells/img_15.jpg_r0_c0.png"
597
- }
598
- ],
599
- "children": [
600
- {
601
- "title": "8.3",
602
- "contents": [
603
- {
604
- "type": "image",
605
- "key": "/topic-extraction/cells/img_15.jpg_r1_c1.png"
606
- }
607
- ],
608
- "children": []
609
- },
610
- {
611
- "title": "8.4",
612
- "contents": [
613
- {
614
- "type": "image",
615
- "key": "/topic-extraction/cells/img_15.jpg_r2_c0.png"
616
- }
617
- ],
618
- "children": []
619
- },
620
- {
621
- "title": "8.5",
622
- "contents": [
623
- {
624
- "type": "image",
625
- "key": "/topic-extraction/cells/img_15.jpg_r3_c0.png"
626
- }
627
- ],
628
- "children": []
629
- },
630
- {
631
- "title": "8.6",
632
- "contents": [
633
- {
634
- "type": "image",
635
- "key": "/topic-extraction/cells/img_15.jpg_r4_c0.png"
636
- }
637
- ],
638
- "children": []
639
- }
640
- ]
641
- },
642
- {
643
- "title": "Topics",
644
- "contents": [
645
- {
646
- "type": "image",
647
- "key": "/topic-extraction/cells/img_16.jpg_r0_c0.png"
648
- }
649
- ],
650
- "children": [
651
- {
652
- "title": "8.7",
653
- "contents": [
654
- {
655
- "type": "image",
656
- "key": "/topic-extraction/cells/img_16.jpg_r1_c1.png"
657
- }
658
- ],
659
- "children": []
660
- },
661
- {
662
- "title": "8.8",
663
- "contents": [
664
- {
665
- "type": "image",
666
- "key": "/topic-extraction/cells/img_16.jpg_r2_c0.png"
667
- }
668
- ],
669
- "children": []
670
- },
671
- {
672
- "title": "9.1",
673
- "contents": [
674
- {
675
- "type": "image",
676
- "key": "/topic-extraction/cells/img_16.jpg_r3_c1.png"
677
- }
678
- ],
679
- "children": []
680
- },
681
- {
682
- "title": "9.2",
683
- "contents": [
684
- {
685
- "type": "image",
686
- "key": "/topic-extraction/cells/img_16.jpg_r4_c0.png"
687
- }
688
- ],
689
- "children": []
690
- },
691
- {
692
- "title": "9.3",
693
- "contents": [
694
- {
695
- "type": "image",
696
- "key": "/topic-extraction/cells/img_16.jpg_r5_c0.png"
697
- }
698
- ],
699
- "children": []
700
- }
701
- ]
702
- },
703
- {
704
- "title": "9 Numerical methods",
705
- "contents": [
706
- {
707
- "type": "image",
708
- "key": "/topic-extraction/cells/img_17.jpg_r0_c0.png"
709
- }
710
- ],
711
- "children": [
712
- {
713
- "title": "9.4",
714
- "contents": [
715
- {
716
- "type": "image",
717
- "key": "/topic-extraction/cells/img_17.jpg_r0_c1.png"
718
- }
719
- ],
720
- "children": []
721
- },
722
- {
723
- "title": "9.5",
724
- "contents": [
725
- {
726
- "type": "image",
727
- "key": "/topic-extraction/cells/img_17.jpg_r1_c0.png"
728
- }
729
- ],
730
- "children": []
731
- },
732
- {
733
- "title": "10.1",
734
- "contents": [
735
- {
736
- "type": "image",
737
- "key": "/topic-extraction/cells/img_17.jpg_r2_c1.png"
738
- }
739
- ],
740
- "children": []
741
- },
742
- {
743
- "title": "10.2",
744
- "contents": [
745
- {
746
- "type": "image",
747
- "key": "/topic-extraction/cells/img_17.jpg_r3_c0.png"
748
- }
749
- ],
750
- "children": []
751
- },
752
- {
753
- "title": "10.3",
754
- "contents": [
755
- {
756
- "type": "image",
757
- "key": "/topic-extraction/cells/img_17.jpg_r4_c0.png"
758
- }
759
- ],
760
- "children": []
761
- },
762
- {
763
- "title": "10.4",
764
- "contents": [
765
- {
766
- "type": "image",
767
- "key": "/topic-extraction/cells/img_17.jpg_r5_c0.png"
768
- }
769
- ],
770
- "children": []
771
- }
772
- ]
773
- },
774
- {
775
- "title": "Topics",
776
- "contents": [
777
- {
778
- "type": "image",
779
- "key": "/topic-extraction/cells/img_18.jpg_r0_c0.png"
780
- }
781
- ],
782
- "children": [
783
- {
784
- "title": "10.5",
785
- "contents": [
786
- {
787
- "type": "image",
788
- "key": "/topic-extraction/cells/img_18.jpg_r1_c1.png"
789
- }
790
- ],
791
- "children": []
792
- }
793
- ]
794
- },
795
- {
796
- "title": "Topics",
797
- "contents": [
798
- {
799
- "type": "image",
800
- "key": "/topic-extraction/cells/img_19.jpg_r0_c0.png"
801
- }
802
- ],
803
- "children": [
804
- {
805
- "title": "1.1",
806
- "contents": [
807
- {
808
- "type": "image",
809
- "key": "/topic-extraction/cells/img_19.jpg_r1_c1.png"
810
- }
811
- ],
812
- "children": []
813
- },
814
- {
815
- "title": "2.1",
816
- "contents": [
817
- {
818
- "type": "image",
819
- "key": "/topic-extraction/cells/img_19.jpg_r2_c1.png"
820
- }
821
- ],
822
- "children": []
823
- }
824
- ]
825
- },
826
- {
827
- "title": "Topics",
828
- "contents": [
829
- {
830
- "type": "image",
831
- "key": "/topic-extraction/cells/img_20.jpg_r0_c0.png"
832
- }
833
- ],
834
- "children": [
835
- {
836
- "title": "2.2",
837
- "contents": [
838
- {
839
- "type": "image",
840
- "key": "/topic-extraction/cells/img_20.jpg_r1_c1.png"
841
- }
842
- ],
843
- "children": []
844
- }
845
- ]
846
- },
847
- {
848
- "title": "2 Data presentation and interpretation continued",
849
- "contents": [
850
- {
851
- "type": "image",
852
- "key": "/topic-extraction/cells/img_21.jpg_r0_c0.png"
853
- }
854
- ],
855
- "children": [
856
- {
857
- "title": "2.4",
858
- "contents": [
859
- {
860
- "type": "image",
861
- "key": "/topic-extraction/cells/img_21.jpg_r0_c1.png"
862
- }
863
- ],
864
- "children": []
865
- },
866
- {
867
- "title": "3.1",
868
- "contents": [
869
- {
870
- "type": "image",
871
- "key": "/topic-extraction/cells/img_21.jpg_r1_c1.png"
872
- }
873
- ],
874
- "children": []
875
- }
876
- ]
877
- },
878
- {
879
- "title": "Topics",
880
- "contents": [
881
- {
882
- "type": "image",
883
- "key": "/topic-extraction/cells/img_22.jpg_r0_c0.png"
884
- }
885
- ],
886
- "children": [
887
- {
888
- "title": "3.3",
889
- "contents": [
890
- {
891
- "type": "image",
892
- "key": "/topic-extraction/cells/img_22.jpg_r1_c1.png"
893
- }
894
- ],
895
- "children": []
896
- },
897
- {
898
- "title": "4.1",
899
- "contents": [
900
- {
901
- "type": "image",
902
- "key": "/topic-extraction/cells/img_22.jpg_r2_c1.png"
903
- }
904
- ],
905
- "children": []
906
- },
907
- {
908
- "title": "4.2",
909
- "contents": [
910
- {
911
- "type": "image",
912
- "key": "/topic-extraction/cells/img_22.jpg_r3_c0.png"
913
- }
914
- ],
915
- "children": []
916
- }
917
- ]
918
- },
919
- {
920
- "title": "4 Statistical distributions continued",
921
- "contents": [
922
- {
923
- "type": "image",
924
- "key": "/topic-extraction/cells/img_23.jpg_r0_c0.png"
925
- }
926
- ],
927
- "children": [
928
- {
929
- "title": "4.3",
930
- "contents": [
931
- {
932
- "type": "image",
933
- "key": "/topic-extraction/cells/img_23.jpg_r0_c1.png"
934
- }
935
- ],
936
- "children": []
937
- },
938
- {
939
- "title": "5.1",
940
- "contents": [
941
- {
942
- "type": "image",
943
- "key": "/topic-extraction/cells/img_23.jpg_r1_c1.png"
944
- }
945
- ],
946
- "children": []
947
- }
948
- ]
949
- },
950
- {
951
- "title": "Topics",
952
- "contents": [
953
- {
954
- "type": "image",
955
- "key": "/topic-extraction/cells/img_24.jpg_r0_c0.png"
956
- }
957
- ],
958
- "children": [
959
- {
960
- "title": "5.2",
961
- "contents": [
962
- {
963
- "type": "image",
964
- "key": "/topic-extraction/cells/img_24.jpg_r1_c1.png"
965
- }
966
- ],
967
- "children": []
968
- },
969
- {
970
- "title": "5.3",
971
- "contents": [
972
- {
973
- "type": "image",
974
- "key": "/topic-extraction/cells/img_24.jpg_r2_c0.png"
975
- }
976
- ],
977
- "children": []
978
- }
979
- ]
980
- },
981
- {
982
- "title": "",
983
- "contents": [],
984
- "children": [
985
- {
986
- "title": "7.1",
987
- "contents": [
988
- {
989
- "type": "image",
990
- "key": "/topic-extraction/cells/img_25.jpg_r1_c0.png"
991
- }
992
- ],
993
- "children": []
994
- },
995
- {
996
- "title": "7.2",
997
- "contents": [
998
- {
999
- "type": "image",
1000
- "key": "/topic-extraction/cells/img_25.jpg_r2_c0.png"
1001
- }
1002
- ],
1003
- "children": []
1004
- },
1005
- {
1006
- "title": "7.3",
1007
- "contents": [
1008
- {
1009
- "type": "image",
1010
- "key": "/topic-extraction/cells/img_25.jpg_r3_c0.png"
1011
- }
1012
- ],
1013
- "children": []
1014
- },
1015
- {
1016
- "title": "7.4",
1017
- "contents": [
1018
- {
1019
- "type": "image",
1020
- "key": "/topic-extraction/cells/img_25.jpg_r4_c0.png"
1021
- }
1022
- ],
1023
- "children": []
1024
- },
1025
- {
1026
- "title": "7.5",
1027
- "contents": [
1028
- {
1029
- "type": "image",
1030
- "key": "/topic-extraction/cells/img_25.jpg_r5_c0.png"
1031
- }
1032
- ],
1033
- "children": []
1034
- }
1035
- ]
1036
- },
1037
- {
1038
- "title": "8 Forces and Newton's laws",
1039
- "contents": [
1040
- {
1041
- "type": "image",
1042
- "key": "/topic-extraction/cells/img_26.jpg_r0_c0.png"
1043
- }
1044
- ],
1045
- "children": [
1046
- {
1047
- "title": "8.1",
1048
- "contents": [
1049
- {
1050
- "type": "image",
1051
- "key": "/topic-extraction/cells/img_26.jpg_r0_c1.png"
1052
- }
1053
- ],
1054
- "children": []
1055
- },
1056
- {
1057
- "title": "8.2",
1058
- "contents": [
1059
- {
1060
- "type": "image",
1061
- "key": "/topic-extraction/cells/img_26.jpg_r1_c0.png"
1062
- }
1063
- ],
1064
- "children": []
1065
- },
1066
- {
1067
- "title": "8.3",
1068
- "contents": [
1069
- {
1070
- "type": "image",
1071
- "key": "/topic-extraction/cells/img_26.jpg_r2_c0.png"
1072
- }
1073
- ],
1074
- "children": []
1075
- }
1076
- ]
1077
- },
1078
- {
1079
- "title": "Topics",
1080
- "contents": [
1081
- {
1082
- "type": "image",
1083
- "key": "/topic-extraction/cells/img_27.jpg_r0_c0.png"
1084
- }
1085
- ],
1086
- "children": [
1087
- {
1088
- "title": "8.4",
1089
- "contents": [
1090
- {
1091
- "type": "image",
1092
- "key": "/topic-extraction/cells/img_27.jpg_r1_c1.png"
1093
- }
1094
- ],
1095
- "children": []
1096
- },
1097
- {
1098
- "title": "8.5",
1099
- "contents": [
1100
- {
1101
- "type": "image",
1102
- "key": "/topic-extraction/cells/img_27.jpg_r2_c0.png"
1103
- }
1104
- ],
1105
- "children": []
1106
- },
1107
- {
1108
- "title": "8.6",
1109
- "contents": [
1110
- {
1111
- "type": "image",
1112
- "key": "/topic-extraction/cells/img_27.jpg_r3_c0.png"
1113
- }
1114
- ],
1115
- "children": []
1116
- },
1117
- {
1118
- "title": "9.1",
1119
- "contents": [
1120
- {
1121
- "type": "image",
1122
- "key": "/topic-extraction/cells/img_27.jpg_r4_c1.png"
1123
- }
1124
- ],
1125
- "children": []
1126
- }
1127
- ]
1128
- },
1129
- {
1130
- "title": "Reason, interpret and communicate mathematically",
1131
- "contents": [
1132
- {
1133
- "type": "image",
1134
- "key": "/topic-extraction/cells/img_28.jpg_r1_c0.png"
1135
- }
1136
- ],
1137
- "children": []
1138
- }
1139
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
we/we_ars/final_subtopics.json DELETED
@@ -1,282 +0,0 @@
1
- [
2
- {
3
- "title": null,
4
- "contents": [
5
- {
6
- "type": "image",
7
- "key": "/topic-extraction/img_1.jpg"
8
- }
9
- ],
10
- "children": []
11
- },
12
- {
13
- "title": null,
14
- "contents": [
15
- {
16
- "type": "image",
17
- "key": "/topic-extraction/img_2.jpg"
18
- }
19
- ],
20
- "children": []
21
- },
22
- {
23
- "title": null,
24
- "contents": [
25
- {
26
- "type": "image",
27
- "key": "/topic-extraction/img_3.jpg"
28
- }
29
- ],
30
- "children": []
31
- },
32
- {
33
- "title": null,
34
- "contents": [
35
- {
36
- "type": "image",
37
- "key": "/topic-extraction/img_4.jpg"
38
- }
39
- ],
40
- "children": []
41
- },
42
- {
43
- "title": null,
44
- "contents": [
45
- {
46
- "type": "image",
47
- "key": "/topic-extraction/img_5.jpg"
48
- }
49
- ],
50
- "children": []
51
- },
52
- {
53
- "title": null,
54
- "contents": [
55
- {
56
- "type": "image",
57
- "key": "/topic-extraction/img_6.jpg"
58
- }
59
- ],
60
- "children": []
61
- },
62
- {
63
- "title": null,
64
- "contents": [
65
- {
66
- "type": "image",
67
- "key": "/topic-extraction/img_7.jpg"
68
- }
69
- ],
70
- "children": []
71
- },
72
- {
73
- "title": null,
74
- "contents": [
75
- {
76
- "type": "image",
77
- "key": "/topic-extraction/img_8.jpg"
78
- }
79
- ],
80
- "children": []
81
- },
82
- {
83
- "title": null,
84
- "contents": [
85
- {
86
- "type": "image",
87
- "key": "/topic-extraction/img_9.jpg"
88
- }
89
- ],
90
- "children": []
91
- },
92
- {
93
- "title": null,
94
- "contents": [
95
- {
96
- "type": "image",
97
- "key": "/topic-extraction/img_10.jpg"
98
- }
99
- ],
100
- "children": []
101
- },
102
- {
103
- "title": null,
104
- "contents": [
105
- {
106
- "type": "image",
107
- "key": "/topic-extraction/img_11.jpg"
108
- }
109
- ],
110
- "children": []
111
- },
112
- {
113
- "title": null,
114
- "contents": [
115
- {
116
- "type": "image",
117
- "key": "/topic-extraction/img_12.jpg"
118
- }
119
- ],
120
- "children": []
121
- },
122
- {
123
- "title": null,
124
- "contents": [
125
- {
126
- "type": "image",
127
- "key": "/topic-extraction/img_13.jpg"
128
- }
129
- ],
130
- "children": []
131
- },
132
- {
133
- "title": null,
134
- "contents": [
135
- {
136
- "type": "image",
137
- "key": "/topic-extraction/img_14.jpg"
138
- }
139
- ],
140
- "children": []
141
- },
142
- {
143
- "title": null,
144
- "contents": [
145
- {
146
- "type": "image",
147
- "key": "/topic-extraction/img_15.jpg"
148
- }
149
- ],
150
- "children": []
151
- },
152
- {
153
- "title": null,
154
- "contents": [
155
- {
156
- "type": "image",
157
- "key": "/topic-extraction/img_16.jpg"
158
- }
159
- ],
160
- "children": []
161
- },
162
- {
163
- "title": null,
164
- "contents": [
165
- {
166
- "type": "image",
167
- "key": "/topic-extraction/img_17.jpg"
168
- }
169
- ],
170
- "children": []
171
- },
172
- {
173
- "title": null,
174
- "contents": [
175
- {
176
- "type": "image",
177
- "key": "/topic-extraction/img_18.jpg"
178
- }
179
- ],
180
- "children": []
181
- },
182
- {
183
- "title": null,
184
- "contents": [
185
- {
186
- "type": "image",
187
- "key": "/topic-extraction/img_19.jpg"
188
- }
189
- ],
190
- "children": []
191
- },
192
- {
193
- "title": null,
194
- "contents": [
195
- {
196
- "type": "image",
197
- "key": "/topic-extraction/img_20.jpg"
198
- }
199
- ],
200
- "children": []
201
- },
202
- {
203
- "title": null,
204
- "contents": [
205
- {
206
- "type": "image",
207
- "key": "/topic-extraction/img_21.jpg"
208
- }
209
- ],
210
- "children": []
211
- },
212
- {
213
- "title": null,
214
- "contents": [
215
- {
216
- "type": "image",
217
- "key": "/topic-extraction/img_22.jpg"
218
- }
219
- ],
220
- "children": []
221
- },
222
- {
223
- "title": null,
224
- "contents": [
225
- {
226
- "type": "image",
227
- "key": "/topic-extraction/img_23.jpg"
228
- }
229
- ],
230
- "children": []
231
- },
232
- {
233
- "title": null,
234
- "contents": [
235
- {
236
- "type": "image",
237
- "key": "/topic-extraction/img_24.jpg"
238
- }
239
- ],
240
- "children": []
241
- },
242
- {
243
- "title": null,
244
- "contents": [
245
- {
246
- "type": "image",
247
- "key": "/topic-extraction/img_25.jpg"
248
- }
249
- ],
250
- "children": []
251
- },
252
- {
253
- "title": null,
254
- "contents": [
255
- {
256
- "type": "image",
257
- "key": "/topic-extraction/img_26.jpg"
258
- }
259
- ],
260
- "children": []
261
- },
262
- {
263
- "title": null,
264
- "contents": [
265
- {
266
- "type": "image",
267
- "key": "/topic-extraction/img_27.jpg"
268
- }
269
- ],
270
- "children": []
271
- },
272
- {
273
- "title": null,
274
- "contents": [
275
- {
276
- "type": "image",
277
- "key": "/topic-extraction/img_28.jpg"
278
- }
279
- ],
280
- "children": []
281
- }
282
- ]