PromiseTrack-AI / pipelines /text /extract_text_data.py
Harisri
Initial deployment: PromiseTrack AI with 15 companies
80b6680
#!/usr/bin/env python3
"""
extract_text_data.py
Handles structure: CompanyName / YearFolder / PDFs directly inside
Assigns recency weights to year folders automatically.
"""
import re
from pathlib import Path
from typing import Optional
import pdfplumber
def _parse_year_from_folder(folder_name: str) -> Optional[int]:
"""Extract ending year from folder names like FY 24-25, TATA_23-24, INFOYSIS_24."""
m = re.search(r'(\d{2})-(\d{2})', folder_name)
if m:
return 2000 + int(m.group(2))
m = re.search(r'[_\s](\d{2,4})$', folder_name)
if m:
y = int(m.group(1))
return y if y > 2000 else 2000 + y
m = re.search(r'(20\d{2})', folder_name)
if m:
return int(m.group(1))
return None
def assign_year_weights(year_folders: list) -> dict:
"""Most recent year β†’ 1.0, each older β†’ -0.25, floor 0.1."""
sorted_f = sorted(year_folders, key=lambda x: x[1], reverse=True)
return {folder: round(max(1.0 - rank * 0.25, 0.1), 2)
for rank, (folder, _) in enumerate(sorted_f)}
def _extract_quarter(filename: str, year: Optional[int]) -> str:
m = re.search(r'[Qq]([1-4])', filename)
if m and year:
return f"{year}-Q{m.group(1)}"
return f"{year}-Unknown" if year else "Unknown"
def _clean_text(text: str) -> str:
if not text:
return ""
text = re.sub(r'\r\n|\r', '\n', text)
lines = [l for l in text.splitlines()
if not re.fullmatch(r'[\d\s\.\,\|\-\%\(\)]+', l.strip())
and len(l.strip()) > 3]
return "\n".join(lines)
def process_single_file(file_path: Path, company_name: str = "Unknown",
quarter: str = "Unknown", weight: float = 1.0) -> dict:
suffix = file_path.suffix.lower()
text = ""
if suffix == ".pdf":
try:
with pdfplumber.open(file_path) as pdf:
pages = [_clean_text(p.extract_text()) for p in pdf.pages]
text = "\n".join(p for p in pages if p.strip())
except Exception as e:
return {"error": str(e)}
elif suffix == ".txt":
for enc in ("utf-8", "latin-1"):
try:
text = _clean_text(file_path.read_text(encoding=enc))
break
except Exception:
continue
if not text.strip():
return {"error": "No text extracted"}
return {"company": company_name, "quarter": quarter, "weight": weight,
"source": "call_transcript", "raw_text": text, "filename": file_path.name}
def run_text_extraction_pipeline(companies_dir_path: str) -> list:
"""
Scans: companies_dir/<Company>/<YearFolder>/*.pdf|txt
Most recent year folder β†’ weight 1.0, older β†’ decreasing weights.
"""
companies_dir = Path(companies_dir_path)
if not companies_dir.exists():
raise FileNotFoundError(f"Not found: {companies_dir}")
records = []
for company_dir in sorted(companies_dir.iterdir()):
if not company_dir.is_dir():
continue
company_name = company_dir.name
# Collect year subfolders
year_folders = []
for sub in company_dir.iterdir():
if not sub.is_dir():
continue
year = _parse_year_from_folder(sub.name)
if year:
year_folders.append((sub, year))
# Fallback: PDFs directly in company folder
if not year_folders:
year_folders = [(company_dir, 2024)]
weight_map = assign_year_weights(year_folders)
for folder, year in year_folders:
weight = weight_map.get(folder, 0.1)
for file in sorted(folder.glob("*")):
if file.suffix.lower() not in (".pdf", ".txt"):
continue
result = process_single_file(
file, company_name, _extract_quarter(file.name, year), weight
)
if "error" not in result:
records.append(result)
return records