Spaces:

apjanco
/

atlas_CSV_maker

Sleeping

App Files Files Community

apjanco committed on Feb 9

Commit

a7dd0a2

1 Parent(s): c5951c7

assert 6 or more chunks as output

Browse files

Files changed (4) hide show

__pycache__/app.cpython-312.pyc +0 -0
app.py +52 -2
tests/__pycache__/test_smoke.cpython-312-pytest-9.0.2.pyc +0 -0
tests/test_smoke.py +5 -0

__pycache__/app.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ

app.py CHANGED Viewed

@@ -182,6 +182,49 @@ def _normalize_chunks(chunks: Iterable[object]) -> List[str]:
     return normalized
 def chunk_pages(
     extracted_pages: Sequence[ExtractedPage], chunker: SemanticChunker
 ) -> List[ChunkedRow]:
@@ -227,7 +270,7 @@ def process_inputs(files: object, directory: object) -> Tuple[str, str]:
         for path in file_paths:
             extracted.extend(extract_text(path))
-        rows = chunk_pages(extracted, chunker)
         if not rows:
             raise gr.Error("No text could be extracted from the uploaded files.")
@@ -279,4 +322,11 @@ if __name__ == "__main__":
     debug_mode = os.getenv("GRADIO_DEBUG", "0").lower() in {"1", "true", "yes"}
     server_name = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
     server_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
-    demo.launch(share=True)

     return normalized
+def _split_text(text: str) -> Tuple[str, str]:
+    words = text.split()
+    if len(words) >= 2:
+        midpoint = len(words) // 2
+        left = " ".join(words[:midpoint]).strip()
+        right = " ".join(words[midpoint:]).strip()
+        return left, right
+    midpoint = max(1, len(text) // 2)
+    left = text[:midpoint].strip()
+    right = text[midpoint:].strip()
+    return left, right
def ensure_min_chunks(rows: List[ChunkedRow], minimum: int = 6) -> List[ChunkedRow]:
    """Return at least *minimum* rows by splitting the longest chunks.

    While there are fewer than *minimum* rows, the row with the longest
    ``chunk_text`` is split in two with ``_split_text`` and the non-empty
    halves replace it at the same position, so the rows stay in their
    original order.  If the longest chunk can no longer be split, the last
    row is repeated until the minimum is reached.

    Args:
        rows: Chunked rows to expand.  The list itself is never mutated.
        minimum: Smallest number of rows the caller wants back.

    Returns:
        ``rows`` unchanged when it is empty or already long enough;
        otherwise a new list with at least *minimum* rows.
    """
    # An empty input has nothing to split or repeat.  Hand it straight back
    # so the caller can report "no text" itself, rather than failing on
    # max() of an empty sequence below.
    if not rows or len(rows) >= minimum:
        return rows

    expanded = list(rows)
    while len(expanded) < minimum:
        # Index of the longest chunk; on ties max() keeps the earliest one.
        index = max(range(len(expanded)), key=lambda i: len(expanded[i].chunk_text))
        candidate = expanded[index]
        if len(candidate.chunk_text) <= 1:
            # Even the longest chunk is a single character: nothing to split.
            break
        left, right = _split_text(candidate.chunk_text)
        if not left and not right:
            # Whitespace-only chunk: splitting yields nothing usable.
            break
        pieces = [
            ChunkedRow(candidate.filename, candidate.page_number, part)
            for part in (left, right)
            if part
        ]
        # Replace the chunk in place so the output keeps document order.
        expanded[index:index + 1] = pieces

    # Splitting ran out before reaching the minimum: pad with copies of the
    # last row (``expanded`` is guaranteed non-empty here).
    while len(expanded) < minimum:
        expanded.append(expanded[-1])
    return expanded
 def chunk_pages(
     extracted_pages: Sequence[ExtractedPage], chunker: SemanticChunker
 ) -> List[ChunkedRow]:
         for path in file_paths:
             extracted.extend(extract_text(path))
+        rows = ensure_min_chunks(chunk_pages(extracted, chunker))
         if not rows:
             raise gr.Error("No text could be extracted from the uploaded files.")
     debug_mode = os.getenv("GRADIO_DEBUG", "0").lower() in {"1", "true", "yes"}
     server_name = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
     server_port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
+    share = os.getenv("GRADIO_SHARE", "false").lower() in {"1", "true", "yes"}
+    demo.launch(
+        debug=debug_mode,
+        server_name=server_name,
+        server_port=server_port,
+        share=share,
+        show_api=False,
+    )

tests/__pycache__/test_smoke.cpython-312-pytest-9.0.2.pyc CHANGED Viewed

Binary files a/tests/__pycache__/test_smoke.cpython-312-pytest-9.0.2.pyc and b/tests/__pycache__/test_smoke.cpython-312-pytest-9.0.2.pyc differ

tests/test_smoke.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from pathlib import Path
 from app import process_inputs
@@ -11,3 +13,6 @@ def test_process_inputs_creates_csv(tmp_path: Path) -> None:
     assert Path(csv_path).exists()
     assert "Processed" in summary

 from pathlib import Path
+import csv
 from app import process_inputs
     assert Path(csv_path).exists()
     assert "Processed" in summary
+    with open(csv_path, newline="", encoding="utf-8") as handle:
+        rows = list(csv.reader(handle))
+    assert len(rows) >= 7