File size: 4,018 Bytes
80b6680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
"""
extract_text_data.py
Handles structure: CompanyName / YearFolder / PDFs directly inside
Assigns recency weights to year folders automatically.
"""

import re
from pathlib import Path
from typing import Optional
import pdfplumber


def _parse_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract ending year from folder names like FY 24-25, TATA_23-24, INFOYSIS_24."""
    m = re.search(r'(\d{2})-(\d{2})', folder_name)
    if m:
        return 2000 + int(m.group(2))
    m = re.search(r'[_\s](\d{2,4})$', folder_name)
    if m:
        y = int(m.group(1))
        return y if y > 2000 else 2000 + y
    m = re.search(r'(20\d{2})', folder_name)
    if m:
        return int(m.group(1))
    return None


def assign_year_weights(year_folders: list) -> dict:
    """Map each folder to a recency weight derived from its year.

    The most recent year gets 1.0; every older *distinct* year loses a
    further 0.25, with a floor of 0.1. Folders that share a year share a
    weight, so the result does not depend on the order of the input.

    Args:
        year_folders: Sequence of ``(folder, year)`` pairs. ``folder`` is
            used as the dictionary key and must therefore be hashable.

    Returns:
        Dict of ``folder -> weight`` (rounded to 2 decimals), with entries
        inserted newest year first. Empty input yields an empty dict.
    """
    # Rank years, not list positions: ranking positions would penalise one of
    # two folders that resolve to the same year.
    distinct_years = sorted({year for _, year in year_folders}, reverse=True)
    rank_of_year = {year: rank for rank, year in enumerate(distinct_years)}
    newest_first = sorted(year_folders, key=lambda x: x[1], reverse=True)
    return {folder: round(max(1.0 - rank_of_year[year] * 0.25, 0.1), 2)
            for folder, year in newest_first}


def _extract_quarter(filename: str, year: Optional[int]) -> str:
    m = re.search(r'[Qq]([1-4])', filename)
    if m and year:
        return f"{year}-Q{m.group(1)}"
    return f"{year}-Unknown" if year else "Unknown"


def _clean_text(text: str) -> str:
    if not text:
        return ""
    text = re.sub(r'\r\n|\r', '\n', text)
    lines = [l for l in text.splitlines()
             if not re.fullmatch(r'[\d\s\.\,\|\-\%\(\)]+', l.strip())
             and len(l.strip()) > 3]
    return "\n".join(lines)


def process_single_file(file_path: Path, company_name: str = "Unknown",
                        quarter: str = "Unknown", weight: float = 1.0) -> dict:
    """Extract cleaned text from one ``.pdf`` or ``.txt`` file.

    PDFs are opened with ``pdfplumber`` and each page's text is passed
    through ``_clean_text``; pages that end up empty are skipped. Text files
    are decoded as UTF-8, falling back to Latin-1, and cleaned the same way.
    Any other suffix yields no text.

    Args:
        file_path: File to read (a ``Path`` or a path string).
        company_name: Stored under ``"company"`` in the record.
        quarter: Stored under ``"quarter"`` in the record.
        weight: Stored under ``"weight"`` in the record.

    Returns:
        On success, a dict with keys ``company``, ``quarter``, ``weight``,
        ``source`` (always ``"call_transcript"``), ``raw_text`` and
        ``filename``. On failure, ``{"error": <message>}``; the function
        does not raise for unreadable or empty files.
    """
    file_path = Path(file_path)  # tolerate plain string paths
    suffix = file_path.suffix.lower()
    text = ""
    if suffix == ".pdf":
        try:
            with pdfplumber.open(file_path) as pdf:
                pages = [_clean_text(p.extract_text()) for p in pdf.pages]
            text = "\n".join(p for p in pages if p.strip())
        except Exception as e:
            # Best-effort boundary: any PDF parsing failure is reported to
            # the caller as an error record rather than raised.
            return {"error": str(e)}
    elif suffix == ".txt":
        raw = None
        for enc in ("utf-8", "latin-1"):
            try:
                raw = file_path.read_text(encoding=enc)
            except UnicodeDecodeError:
                # Wrong encoding guess -- try the next candidate.
                continue
            except OSError as e:
                # Missing/unreadable file: retrying with another encoding
                # cannot help, so surface the real cause.
                return {"error": str(e)}
            break
        text = _clean_text(raw) if raw is not None else ""
    if not text.strip():
        return {"error": "No text extracted"}
    return {"company": company_name, "quarter": quarter, "weight": weight,
            "source": "call_transcript", "raw_text": text, "filename": file_path.name}


def run_text_extraction_pipeline(companies_dir_path: str,
                                 default_year: int = 2024) -> list:
    """Extract text records for every company under a root directory.

    Scans ``companies_dir/<Company>/<YearFolder>/*.pdf|txt``. Within each
    company the most recent year folder gets weight 1.0 and older ones get
    decreasing weights (see ``assign_year_weights``). Directories are walked
    in sorted order so the result order is reproducible.

    Args:
        companies_dir_path: Root directory holding one subdirectory per
            company.
        default_year: Year assumed for a company that has no recognisable
            year subfolder; its files are then read directly from the
            company directory.

    Returns:
        List of record dicts as produced by ``process_single_file``. Files
        whose extraction reported an error are omitted.

    Raises:
        FileNotFoundError: If ``companies_dir_path`` does not exist.
        NotADirectoryError: If ``companies_dir_path`` is not a directory.
    """
    companies_dir = Path(companies_dir_path)
    if not companies_dir.exists():
        raise FileNotFoundError(f"Not found: {companies_dir}")
    if not companies_dir.is_dir():
        raise NotADirectoryError(f"Not a directory: {companies_dir}")

    records = []
    for company_dir in sorted(companies_dir.iterdir()):
        if not company_dir.is_dir():
            continue
        company_name = company_dir.name

        # Collect year subfolders (sorted: iterdir() order is arbitrary).
        year_folders = []
        for sub in sorted(company_dir.iterdir()):
            if not sub.is_dir():
                continue
            year = _parse_year_from_folder(sub.name)
            if year:
                year_folders.append((sub, year))

        # Fallback: files sit directly in the company folder.
        if not year_folders:
            year_folders = [(company_dir, default_year)]

        weight_map = assign_year_weights(year_folders)

        for folder, year in year_folders:
            weight = weight_map.get(folder, 0.1)
            for file in sorted(folder.glob("*")):
                # Skip directories that merely look like documents.
                if not file.is_file():
                    continue
                if file.suffix.lower() not in (".pdf", ".txt"):
                    continue
                result = process_single_file(
                    file, company_name, _extract_quarter(file.name, year), weight
                )
                if "error" not in result:
                    records.append(result)

    return records