File size: 3,054 Bytes
80b6680
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
"""
split_sentences.py
Splits raw transcript text into clean sentences using nltk.sent_tokenize.
Output: sentence_data.csv  (company, quarter, source, sentence)
"""

import re
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize

# --- Resource Initialization ---
def _ensure_nltk_resources():
    """Make sure the NLTK tokenizer data used by this module is available.

    Each resource path is looked up with ``nltk.data.find``; when the lookup
    raises ``LookupError`` the matching package (the last path component) is
    fetched with ``nltk.download`` in quiet mode.
    """
    resource_paths = ("tokenizers/punkt", "tokenizers/punkt_tab")
    for path in resource_paths:
        try:
            nltk.data.find(path)
        except LookupError:
            # The download identifier is the trailing component of the path.
            package = path.split('/')[-1]
            nltk.download(package, quiet=True)


# Runs at import time so the tokenizer data is checked before first use.
_ensure_nltk_resources()

# --- Garbage Patterns (Constants) ---
# Matches text that should be discarded as non-sentence noise. Used with
# ``.search``; the first two alternatives are anchored to the whole string,
# the third matches anywhere.
#
# The pattern is compiled WITHOUT a global IGNORECASE flag so that the header
# alternative really is upper-case only; case-insensitivity is scoped to the
# URL markers via ``(?i:...)``.
_GARBAGE_RE = re.compile(
    r'^[\d\s\.\,\|\-\%\(\)\$\₹\/\:]+$'        # purely numeric / symbolic
    r'|^\s*[A-Z]+(?:\s+[A-Z]+){0,5}\s*$'       # ALL-CAPS short header (1-6 words)
    r'|(?i:www\.|http)|©|™|®',                 # URLs (any case) / legal symbols
)

# --- Internal Helpers ---

def _clean_sentence(s: str) -> str:
    """Collapse line breaks, tabs and repeated whitespace; trim both ends.

    Runs of carriage returns, newlines and tabs become one space, then any
    run of two or more whitespace characters becomes one space, and finally
    leading/trailing whitespace is stripped.
    """
    flattened = re.sub(r'[\r\n\t]+', ' ', s)
    return re.sub(r'\s{2,}', ' ', flattened).strip()

def _is_valid_sentence(s: str) -> bool:
    """Decide whether a cleaned sentence should be kept.

    A sentence is rejected when it is shorter than 20 characters or when
    ``_GARBAGE_RE`` finds a match in it. Otherwise it is kept only if it
    contains at least three runs of two or more ASCII letters.
    """
    too_short = len(s) < 20
    if too_short or _GARBAGE_RE.search(s):
        return False
    # Count word-like tokens: two or more consecutive ASCII letters.
    alpha_tokens = re.findall(r'[a-zA-Z]{2,}', s)
    return len(alpha_tokens) >= 3

# --- Core Callable Functions ---

def process_text_into_sentences(text: str) -> list[str]:
    """
    Takes a raw string and returns a list of cleaned, validated sentences.
    Useful for processing single inputs from a web form.

    Args:
        text: Raw text to split. ``None``, scalar missing values recognised
            by ``pd.isna`` (e.g. ``float('nan')``, ``pd.NA``) and the empty
            string all yield an empty list. Any other non-string value is
            converted with ``str()`` before tokenizing.

    Returns:
        The sentences produced by ``sent_tokenize``, each passed through
        ``_clean_sentence`` and kept only if ``_is_valid_sentence`` accepts
        it, in their original order.
    """
    if isinstance(text, str):
        raw = text
    elif text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        # The missing-value test must come before any truthiness test:
        # bool(pd.NA) raises TypeError. It is limited to scalars because
        # pd.isna returns an array for list-likes, whose truth value is
        # ambiguous.
        return []
    else:
        raw = str(text)

    if not raw:
        return []

    cleaned = (_clean_sentence(s) for s in sent_tokenize(raw))
    return [c for c in cleaned if _is_valid_sentence(c)]

def run_sentence_splitting_pipeline(extracted_records: list[dict]) -> list[dict]:
    """
    Batch entry point: expand extracted records into one record per sentence.

    Input: list of dicts; each is read for the keys "raw_text", "company",
    "quarter" and "source" (missing metadata keys become None, missing
    "raw_text" is treated as an empty string).
    Output: list of dicts with keys "company", "quarter", "source" and
    "sentence", in first-seen order.

    Duplicates are removed by sentence text alone, across ALL input records:
    when the same sentence appears more than once (even under a different
    company/quarter/source), only the first occurrence is kept.
    """
    emitted_texts = set()
    output_rows = []

    for entry in extracted_records:
        for sentence in process_text_into_sentences(entry.get("raw_text", "")):
            # Skip text that has already been emitted by any earlier record.
            if sentence in emitted_texts:
                continue
            emitted_texts.add(sentence)
            output_rows.append({
                "company":  entry.get("company"),
                "quarter":  entry.get("quarter"),
                "source":   entry.get("source"),
                "sentence": sentence,
            })

    return output_rows