Spaces:

aravindkb7
/

SourceTruth_Test

Paused

App Files Files Community

aravindkb7 committed 22 days ago

Commit

06c7f52

verified ·

1 Parent(s): f31967a

Upload app.py

Browse files

Files changed (1) hide show

app.py +25 -3

app.py CHANGED Viewed

@@ -31,6 +31,7 @@ LOCAL_CORPUS_DIR = Path(r"C:\4 Sem Project\Phase 2\phase 2 corpus")
 CORPUS_CANDIDATES = [
     APP_ROOT / "phase2_corpus",
     APP_ROOT / "phase 2 corpus",
     LOCAL_CORPUS_DIR,
 ]
@@ -1034,22 +1035,43 @@ def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]
     return rows
 def resolve_corpus_dir() -> Path:
     for candidate in CORPUS_CANDIDATES:
-        if candidate.exists() and candidate.is_dir():
             return candidate
-    raise FileNotFoundError("Phase 2 corpus directory not found. Upload the PDF set into a phase2_corpus folder in the app repo.")
 def build_corpus_session() -> SessionData:
     corpus_dir = resolve_corpus_dir()
     page_records: List[PageRecord] = []
     chunks: List[Chunk] = []
     structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
     extractors = set()
     file_hash_parts: List[str] = []
-    for pdf_path in sorted(corpus_dir.glob("*.pdf")):
         file_name = pdf_path.name
         doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
         extractors.add(extractor_name)

 CORPUS_CANDIDATES = [
     APP_ROOT / "phase2_corpus",
     APP_ROOT / "phase 2 corpus",
+    APP_ROOT,
     LOCAL_CORPUS_DIR,
 ]
     return rows
def corpus_pdf_files(candidate: Path) -> List[Path]:
    """Return the numbered corpus PDFs found directly inside *candidate*.

    Only regular files whose names look like ``NN_<something>.pdf`` (two
    digits, an underscore, then at least one more character) are
    considered.  The folder is accepted as a corpus when either

    * all three anchor documents listed below are present, or
    * it holds at least ten such numbered PDFs.

    Returns:
        The numbered PDF paths in sorted order when the folder is accepted;
        an empty list when *candidate* is not a directory or does not look
        like a corpus.
    """
    # Path.is_dir() is already False for a path that does not exist, so a
    # single check covers both "missing" and "not a directory".
    if not candidate.is_dir():
        return []

    name_pattern = re.compile(r"^\d{2}_.+\.pdf$")
    matched: List[Path] = []
    for entry in sorted(candidate.glob("*.pdf")):
        # The is_file() test drops directories that merely end in ".pdf".
        if entry.is_file() and name_pattern.match(entry.name):
            matched.append(entry)

    anchor_docs = {
        "01_Project_Charter.pdf",
        "02_Validation_Master_Plan.pdf",
        "15_Regulatory_Reference_Guide.pdf",
    }
    found_names = {entry.name for entry in matched}

    looks_like_corpus = anchor_docs <= found_names or len(matched) >= 10
    return matched if looks_like_corpus else []
 def resolve_corpus_dir() -> Path:
     """Pick the corpus directory from ``CORPUS_CANDIDATES``.

     Candidates are tried in list order and the first one for which
     ``corpus_pdf_files`` returns a non-empty list is returned.

     Raises:
         FileNotFoundError: if no candidate yields any corpus PDFs.
     """
     # The generator is lazy, so probing stops at the first usable candidate.
     chosen = next(
         (folder for folder in CORPUS_CANDIDATES if corpus_pdf_files(folder)),
         None,
     )
     if chosen is None:
         raise FileNotFoundError(
             "Phase 2 corpus not found. Upload the 15 PDF files either into a phase2_corpus folder in the app repo or at the repo root."
         )
     return chosen
 def build_corpus_session() -> SessionData:
     corpus_dir = resolve_corpus_dir()
+    pdf_paths = corpus_pdf_files(corpus_dir)
     page_records: List[PageRecord] = []
     chunks: List[Chunk] = []
     structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
     extractors = set()
     file_hash_parts: List[str] = []
+    for pdf_path in pdf_paths:
         file_name = pdf_path.name
         doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
         extractors.add(extractor_name)