aravindkb7 committed on
Commit
06c7f52
·
verified ·
1 Parent(s): f31967a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -3
app.py CHANGED
@@ -31,6 +31,7 @@ LOCAL_CORPUS_DIR = Path(r"C:\4 Sem Project\Phase 2\phase 2 corpus")
31
  CORPUS_CANDIDATES = [
32
  APP_ROOT / "phase2_corpus",
33
  APP_ROOT / "phase 2 corpus",
 
34
  LOCAL_CORPUS_DIR,
35
  ]
36
 
@@ -1034,22 +1035,43 @@ def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]
1034
  return rows
1035
 
1036
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  def resolve_corpus_dir() -> Path:
1038
  for candidate in CORPUS_CANDIDATES:
1039
- if candidate.exists() and candidate.is_dir():
1040
  return candidate
1041
- raise FileNotFoundError("Phase 2 corpus directory not found. Upload the PDF set into a phase2_corpus folder in the app repo.")
 
 
1042
 
1043
 
1044
  def build_corpus_session() -> SessionData:
1045
  corpus_dir = resolve_corpus_dir()
 
1046
  page_records: List[PageRecord] = []
1047
  chunks: List[Chunk] = []
1048
  structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
1049
  extractors = set()
1050
  file_hash_parts: List[str] = []
1051
 
1052
- for pdf_path in sorted(corpus_dir.glob("*.pdf")):
1053
  file_name = pdf_path.name
1054
  doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
1055
  extractors.add(extractor_name)
 
31
  CORPUS_CANDIDATES = [
32
  APP_ROOT / "phase2_corpus",
33
  APP_ROOT / "phase 2 corpus",
34
+ APP_ROOT,
35
  LOCAL_CORPUS_DIR,
36
  ]
37
 
 
1035
  return rows
1036
 
1037
 
1038
+ def corpus_pdf_files(candidate: Path) -> List[Path]:
1039
+ if not candidate.exists() or not candidate.is_dir():
1040
+ return []
1041
+ pdfs = sorted(p for p in candidate.glob("*.pdf") if p.is_file())
1042
+ numbered = [p for p in pdfs if re.match(r"^\d{2}_.+\.pdf$", p.name)]
1043
+ required = {
1044
+ "01_Project_Charter.pdf",
1045
+ "02_Validation_Master_Plan.pdf",
1046
+ "15_Regulatory_Reference_Guide.pdf",
1047
+ }
1048
+ names = {p.name for p in numbered}
1049
+ if required.issubset(names):
1050
+ return numbered
1051
+ if len(numbered) >= 10:
1052
+ return numbered
1053
+ return []
1054
+
1055
+
1056
  def resolve_corpus_dir() -> Path:
1057
  for candidate in CORPUS_CANDIDATES:
1058
+ if corpus_pdf_files(candidate):
1059
  return candidate
1060
+ raise FileNotFoundError(
1061
+ "Phase 2 corpus not found. Upload the 15 PDF files either into a phase2_corpus folder in the app repo or at the repo root."
1062
+ )
1063
 
1064
 
1065
  def build_corpus_session() -> SessionData:
1066
  corpus_dir = resolve_corpus_dir()
1067
+ pdf_paths = corpus_pdf_files(corpus_dir)
1068
  page_records: List[PageRecord] = []
1069
  chunks: List[Chunk] = []
1070
  structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
1071
  extractors = set()
1072
  file_hash_parts: List[str] = []
1073
 
1074
+ for pdf_path in pdf_paths:
1075
  file_name = pdf_path.name
1076
  doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
1077
  extractors.add(extractor_name)