Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| split_sentences.py | |
| Splits raw transcript text into clean sentences using nltk.sent_tokenize. | |
| Output: sentence_data.csv (company, quarter, source, sentence) | |
| """ | |
| import re | |
| import nltk | |
| import pandas as pd | |
| from nltk.tokenize import sent_tokenize | |
| # --- Resource Initialization --- | |
| def _ensure_nltk_resources(): | |
| """Quietly ensures necessary NLTK models are downloaded.""" | |
| for resource in ["tokenizers/punkt", "tokenizers/punkt_tab"]: | |
| try: | |
| nltk.data.find(resource) | |
| except LookupError: | |
| nltk.download(resource.split('/')[-1], quiet=True) | |
| _ensure_nltk_resources() | |
| # --- Garbage Patterns (Constants) --- | |
| _GARBAGE_RE = re.compile( | |
| r'^[\d\s\.\,\|\-\%\(\)\$\₹\/\:]+$' # purely numeric / symbolic | |
| r'|^\s*[A-Z\s]{1,6}\s*$' # ALL-CAPS short header (≤6 words) | |
| r'|www\.|http|©|™|®', # URLs / legal symbols | |
| re.IGNORECASE, | |
| ) | |
| # --- Internal Helpers --- | |
| def _clean_sentence(s: str) -> str: | |
| """Normalize whitespace and strip newlines.""" | |
| s = re.sub(r'[\r\n\t]+', ' ', s) | |
| s = re.sub(r'\s{2,}', ' ', s) | |
| return s.strip() | |
| def _is_valid_sentence(s: str) -> bool: | |
| """Return True if the sentence is worth keeping (length/content check).""" | |
| if len(s) < 20: | |
| return False | |
| if _GARBAGE_RE.search(s): | |
| return False | |
| # Must contain at least three alphabetic words | |
| words = re.findall(r'[a-zA-Z]{2,}', s) | |
| return len(words) >= 3 | |
| # --- Core Callable Functions --- | |
| def process_text_into_sentences(text: str) -> list[str]: | |
| """ | |
| Takes a raw string and returns a list of cleaned, validated sentences. | |
| Useful for processing single inputs from a web form. | |
| """ | |
| if not text or pd.isna(text): | |
| return [] | |
| raw_sentences = sent_tokenize(str(text)) | |
| valid_sentences = [] | |
| for s in raw_sentences: | |
| cleaned = _clean_sentence(s) | |
| if _is_valid_sentence(cleaned): | |
| valid_sentences.append(cleaned) | |
| return valid_sentences | |
| def run_sentence_splitting_pipeline(extracted_records: list[dict]) -> list[dict]: | |
| """ | |
| The main entry point for the batch pipeline. | |
| Input: List of dicts from 'run_text_extraction_pipeline'. | |
| Output: List of dicts where each record is a single sentence. | |
| """ | |
| sentence_records = [] | |
| for record in extracted_records: | |
| raw_text = record.get("raw_text", "") | |
| sentences = process_text_into_sentences(raw_text) | |
| for s in sentences: | |
| sentence_records.append({ | |
| "company": record.get("company"), | |
| "quarter": record.get("quarter"), | |
| "source": record.get("source"), | |
| "sentence": s, | |
| }) | |
| # Drop duplicates naturally in the list before returning | |
| seen = set() | |
| unique_records = [] | |
| for rec in sentence_records: | |
| if rec["sentence"] not in seen: | |
| unique_records.append(rec) | |
| seen.add(rec["sentence"]) | |
| return unique_records |