Spaces:
Paused
Paused
Upload app.py
Browse files
app.py
CHANGED
|
@@ -31,6 +31,7 @@ LOCAL_CORPUS_DIR = Path(r"C:\4 Sem Project\Phase 2\phase 2 corpus")
|
|
| 31 |
CORPUS_CANDIDATES = [
|
| 32 |
APP_ROOT / "phase2_corpus",
|
| 33 |
APP_ROOT / "phase 2 corpus",
|
|
|
|
| 34 |
LOCAL_CORPUS_DIR,
|
| 35 |
]
|
| 36 |
|
|
@@ -1034,22 +1035,43 @@ def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]
|
|
| 1034 |
return rows
|
| 1035 |
|
| 1036 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1037 |
def resolve_corpus_dir() -> Path:
|
| 1038 |
for candidate in CORPUS_CANDIDATES:
|
| 1039 |
-
if
|
| 1040 |
return candidate
|
| 1041 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
| 1042 |
|
| 1043 |
|
| 1044 |
def build_corpus_session() -> SessionData:
|
| 1045 |
corpus_dir = resolve_corpus_dir()
|
|
|
|
| 1046 |
page_records: List[PageRecord] = []
|
| 1047 |
chunks: List[Chunk] = []
|
| 1048 |
structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
|
| 1049 |
extractors = set()
|
| 1050 |
file_hash_parts: List[str] = []
|
| 1051 |
|
| 1052 |
-
for pdf_path in
|
| 1053 |
file_name = pdf_path.name
|
| 1054 |
doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
|
| 1055 |
extractors.add(extractor_name)
|
|
|
|
| 31 |
# Directories probed for the corpus PDFs, listed in priority order:
# resolve_corpus_dir() walks this list front to back and returns the first
# entry that holds a usable set of PDFs.
CORPUS_CANDIDATES = [
    # Dedicated corpus folders inside the app repo (both spellings accepted).
    APP_ROOT / "phase2_corpus",
    APP_ROOT / "phase 2 corpus",
    # PDFs uploaded directly at the repo root.
    APP_ROOT,
    # Absolute local path defined earlier in the module; tried last.
    LOCAL_CORPUS_DIR,
]
|
| 37 |
|
|
|
|
| 1035 |
return rows
|
| 1036 |
|
| 1037 |
|
| 1038 |
+
# File names of the form "NN_<title>.pdf". The match is case-insensitive so
# that "01_Foo.PDF" is treated the same on case-sensitive (Linux) and
# case-insensitive (Windows) filesystems.
_NUMBERED_PDF_RE = re.compile(r"^\d{2}_.+\.pdf$", re.IGNORECASE)

# If all of these documents are present the directory is accepted as the
# corpus regardless of how many other numbered PDFs it holds.
_REQUIRED_CORPUS_FILES = frozenset(
    {
        "01_Project_Charter.pdf",
        "02_Validation_Master_Plan.pdf",
        "15_Regulatory_Reference_Guide.pdf",
    }
)

# Fallback acceptance threshold when the required names are not all present.
_MIN_NUMBERED_PDFS = 10


def corpus_pdf_files(candidate: Path) -> List[Path]:
    """Return the numbered corpus PDFs found directly inside *candidate*.

    Only regular files whose name matches ``NN_<title>.pdf`` (two leading
    digits, extension compared case-insensitively) are considered; the
    directory is not searched recursively.

    The directory is accepted as a corpus when either

    * every name in ``_REQUIRED_CORPUS_FILES`` is present, or
    * it holds at least ``_MIN_NUMBERED_PDFS`` numbered PDFs.

    Args:
        candidate: Directory to inspect. It may be missing or may not be a
            directory at all.

    Returns:
        The matching PDF paths in sorted order, or an empty list when
        *candidate* is not a directory or does not satisfy either acceptance
        rule.
    """
    # is_dir() is already False for a missing path, so no exists() check.
    if not candidate.is_dir():
        return []

    numbered = sorted(
        entry
        for entry in candidate.iterdir()
        if entry.is_file() and _NUMBERED_PDF_RE.match(entry.name)
    )
    names = {entry.name for entry in numbered}

    has_required = _REQUIRED_CORPUS_FILES <= names
    if has_required or len(numbered) >= _MIN_NUMBERED_PDFS:
        return numbered
    return []
|
| 1054 |
+
|
| 1055 |
+
|
| 1056 |
def resolve_corpus_dir() -> Path:
    """Return the first directory in ``CORPUS_CANDIDATES`` that holds the corpus.

    A candidate qualifies when ``corpus_pdf_files(candidate)`` returns a
    non-empty list. Candidates are tried in list order.

    Returns:
        The first qualifying candidate directory.

    Raises:
        FileNotFoundError: If no candidate qualifies. The message lists every
            directory that was probed so a misplaced upload is easy to spot.
    """
    for candidate in CORPUS_CANDIDATES:
        if corpus_pdf_files(candidate):
            return candidate

    searched = ", ".join(str(candidate) for candidate in CORPUS_CANDIDATES)
    raise FileNotFoundError(
        "Phase 2 corpus not found. Upload the 15 PDF files either into a phase2_corpus folder in the app repo or at the repo root."
        f" Searched: {searched}"
    )
|
| 1063 |
|
| 1064 |
|
| 1065 |
def build_corpus_session() -> SessionData:
|
| 1066 |
corpus_dir = resolve_corpus_dir()
|
| 1067 |
+
pdf_paths = corpus_pdf_files(corpus_dir)
|
| 1068 |
page_records: List[PageRecord] = []
|
| 1069 |
chunks: List[Chunk] = []
|
| 1070 |
structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
|
| 1071 |
extractors = set()
|
| 1072 |
file_hash_parts: List[str] = []
|
| 1073 |
|
| 1074 |
+
for pdf_path in pdf_paths:
|
| 1075 |
file_name = pdf_path.name
|
| 1076 |
doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
|
| 1077 |
extractors.add(extractor_name)
|