# PromiseTrack-AI/pipelines/text/split_sentences.py
# Harisri
# Initial deployment: PromiseTrack AI with 15 companies
# 80b6680
#!/usr/bin/env python3
"""
split_sentences.py
Splits raw transcript text into clean sentences using nltk.sent_tokenize.
Output: sentence_data.csv (company, quarter, source, sentence)
"""
import re
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
# --- Resource Initialization ---
def _ensure_nltk_resources():
    """Ensure the NLTK resources ``punkt`` and ``punkt_tab`` are installed.

    Each resource path is looked up with ``nltk.data.find``. When the lookup
    raises ``LookupError`` the matching package is fetched with a quiet
    ``nltk.download``. The download's return value is not inspected, so a
    failed download does not raise here.
    """
    for lookup_path in ("tokenizers/punkt", "tokenizers/punkt_tab"):
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            # The download id is the final path component, e.g. "punkt".
            package_id = lookup_path.rpartition("/")[2]
            nltk.download(package_id, quiet=True)


# Executed at import time, before any tokenizing function can be called.
_ensure_nltk_resources()
# --- Garbage Patterns (Constants) ---
# Matches text that should be discarded instead of being kept as a sentence.
# Intended for use with ``.search()``; the three alternatives are:
#   1. the whole string is digits, whitespace and numeric/currency punctuation;
#   2. the whole string is an ALL-CAPS header of at most six words;
#   3. the string contains a URL fragment or a legal symbol anywhere.
# Case-insensitivity is scoped to the URL fragments only. A pattern-wide
# re.IGNORECASE would make the ALL-CAPS alternative match lower-case text too,
# and the header is counted in words (not characters) so that it can actually
# match headers longer than six characters.
_GARBAGE_RE = re.compile(
    r'^[\d\s\.\,\|\-\%\(\)\$\₹\/\:]+$'      # purely numeric / symbolic
    r'|^\s*[A-Z]+(?:\s+[A-Z]+){0,5}\s*$'    # ALL-CAPS short header (≤6 words)
    r'|(?i:www\.|http)|©|™|®',              # URLs / legal symbols
)
# --- Internal Helpers ---
def _clean_sentence(s: str) -> str:
    """Flatten line breaks and repeated whitespace, then trim both ends.

    Two substitutions are applied in order: runs of CR / LF / TAB become one
    space, then any run of two or more whitespace characters becomes one
    space. A lone whitespace character other than CR / LF / TAB is left as is.
    """
    flattened = s
    # Order matters: the second pattern cleans up spaces created by the first.
    for pattern in (r'[\r\n\t]+', r'\s{2,}'):
        flattened = re.sub(pattern, ' ', flattened)
    return flattened.strip()
def _is_valid_sentence(s: str) -> bool:
    """Decide whether a cleaned sentence should be kept.

    A sentence is rejected when it is shorter than 20 characters, when
    ``_GARBAGE_RE`` finds a match in it, or when it contains fewer than three
    runs of two or more ASCII letters.
    """
    too_short = len(s) < 20
    # The length test short-circuits, so the regex is skipped for short text.
    if too_short or _GARBAGE_RE.search(s) is not None:
        return False
    # Must contain at least three alphabetic words
    alpha_tokens = re.findall(r'[a-zA-Z]{2,}', s)
    return len(alpha_tokens) >= 3
# --- Core Callable Functions ---
def process_text_into_sentences(text: str) -> list[str]:
    """
    Takes a raw string and returns a list of cleaned, validated sentences.
    Useful for processing single inputs from a web form.

    Args:
        text: Raw text. Missing values (None, NaN, pd.NA) and empty or
            otherwise falsy input produce an empty list; any other value is
            converted with ``str()`` before tokenizing.

    Returns:
        The sentences produced by ``sent_tokenize``, in order, each passed
        through ``_clean_sentence`` and kept only when ``_is_valid_sentence``
        accepts the cleaned text.
    """
    if isinstance(text, str):
        if not text:
            return []
    elif text is None or text is pd.NA or not text or pd.isna(text):
        # pd.NA is matched by identity before any truthiness test, because
        # bool(pd.NA) raises TypeError and would crash the guard itself.
        return []

    cleaned = (_clean_sentence(raw) for raw in sent_tokenize(str(text)))
    return [sentence for sentence in cleaned if _is_valid_sentence(sentence)]
def run_sentence_splitting_pipeline(extracted_records: list[dict]) -> list[dict]:
    """
    Batch entry point: expand extracted records into one record per sentence.

    Input: list of dicts (per the original note, the output of
        'run_text_extraction_pipeline'); each is read for the keys
        "raw_text", "company", "quarter" and "source".
    Output: list of dicts with the keys company / quarter / source / sentence,
        in input order, keeping only the first record for each sentence text.
    """
    emitted_sentences = set()
    output_rows = []
    for source_record in extracted_records:
        raw_text = source_record.get("raw_text", "")
        for sentence in process_text_into_sentences(raw_text):
            # NOTE(review): uniqueness is keyed on the sentence text alone, so
            # an identical sentence from a different company/quarter/source is
            # dropped as well -- confirm this is intended.
            if sentence in emitted_sentences:
                continue
            emitted_sentences.add(sentence)
            output_rows.append({
                "company": source_record.get("company"),
                "quarter": source_record.get("quarter"),
                "source": source_record.get("source"),
                "sentence": sentence,
            })
    return output_rows