Spaces:

Harisri
/

PromiseTrack-AI

Sleeping

PromiseTrack-AI / pipelines /text /split_sentences.py

Harisri

Initial deployment: PromiseTrack AI with 15 companies

80b6680 about 2 months ago

3.05 kB

	#!/usr/bin/env python3
	"""
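	split_sentences.py
	Splits raw transcript text into clean, validated sentences using nltk.sent_tokenize.
	Output: a list of sentence records (company, quarter, source, sentence).
	The functions here return these records in memory; nothing in this module
	writes sentence_data.csv itself.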
	"""

	import re
	import nltk
	import pandas as pd
	from nltk.tokenize import sent_tokenize

	# --- Resource Initialization ---
	def _ensure_nltk_resources():
	    """Make sure the Punkt tokenizer data used by ``sent_tokenize`` is available.

	    Each required resource is looked up with ``nltk.data.find``; when that
	    lookup raises ``LookupError`` the matching package is fetched with a
	    quiet ``nltk.download`` call.
	    """
	    required_resources = ("tokenizers/punkt", "tokenizers/punkt_tab")
	    for resource_path in required_resources:
	        try:
	            nltk.data.find(resource_path)
	        except LookupError:
	            # The download id is the last path component, e.g. "punkt".
	            package_id = resource_path.rsplit('/', 1)[-1]
	            nltk.download(package_id, quiet=True)


	# Executed at import time, before any function in this module can be called.
	_ensure_nltk_resources()

	# --- Garbage Patterns (Constants) ---
	# Pattern for text that should be discarded as non-sentence noise. It is used
	# with ``search``: a match on ANY one branch marks the text as garbage.
	#
	# The branches must be joined with a bare ``|``. An escaped ``\|`` is a
	# literal pipe character, which glues the branches into a single sequence
	# that can never match and silently disables the whole filter.
	#
	# IGNORECASE is wanted for the URL branch ("HTTP", "WWW."), but it would make
	# an "all caps" test meaningless, so the header branch switches it off
	# locally with the scoped inline flag ``(?-i:...)``.
	_GARBAGE_RE = re.compile(
	    r'^[\d\s.,\\|\-%$₹/:]+$'                      # purely numeric / symbolic
	    r'|(?-i:^\s*[A-Z]+(?:\s+[A-Z]+){0,5}\s*$)'    # ALL-CAPS short header (≤6 words)
	    r'|www\.|http|©|™|®',                         # URLs / legal symbols
	    re.IGNORECASE,
	)

	# --- Internal Helpers ---

	def _clean_sentence(s: str) -> str:
	    """Flatten line breaks and tabs to spaces, squeeze whitespace runs, and trim.

	    Two substitutions are applied in order: runs of CR/LF/TAB become a single
	    space, then any run of two or more whitespace characters becomes a single
	    space. Leading and trailing whitespace is stripped from the result.
	    """
	    # Order matters: line breaks are turned into spaces first so that the
	    # second pattern can collapse them together with neighbouring spaces.
	    for whitespace_pattern in (r'[\r\n\t]+', r'\s{2,}'):
	        s = re.sub(whitespace_pattern, ' ', s)
	    return s.strip()

	def _is_valid_sentence(s: str) -> bool:
	    """Decide whether a cleaned sentence should be kept.

	    A sentence is rejected when it is shorter than 20 characters, when
	    ``_GARBAGE_RE`` finds a match in it, or when it contains fewer than
	    three runs of two or more ASCII letters.
	    """
	    if len(s) < 20 or _GARBAGE_RE.search(s):
	        return False
	    # A "word" here is a run of at least two ASCII letters.
	    alphabetic_words = re.findall(r'[a-zA-Z]{2,}', s)
	    return len(alphabetic_words) >= 3

	# --- Core Callable Functions ---

	def process_text_into_sentences(text: str) -> list[str]:
	    """
	    Split one raw text string into cleaned sentences that pass validation.

	    The text is tokenized with ``sent_tokenize``; each piece is normalized by
	    ``_clean_sentence`` and kept only if ``_is_valid_sentence`` accepts it.
	    Falsy input or a value ``pd.isna`` reports as missing yields an empty list.
	    """
	    if not text or pd.isna(text):
	        return []

	    # Lazily clean each tokenized piece, then filter on the cleaned form.
	    cleaned_pieces = (_clean_sentence(piece) for piece in sent_tokenize(str(text)))
	    return [candidate for candidate in cleaned_pieces if _is_valid_sentence(candidate)]

	def run_sentence_splitting_pipeline(extracted_records: list[dict]) -> list[dict]:
	    """
	    Batch entry point: expand extracted records into one record per sentence.

	    Input: list of dicts; each is read for "raw_text", "company", "quarter"
	    and "source" via ``dict.get`` (missing keys give an empty text / ``None``).
	    Output: list of dicts with keys "company", "quarter", "source", "sentence".

	    Duplicates are removed by sentence text alone, keeping the first
	    occurrence in input order. A sentence that appears again under a
	    different company, quarter or source is therefore dropped.
	    """
	    seen_sentences = set()
	    unique_records = []

	    for source_record in extracted_records:
	        text = source_record.get("raw_text", "")
	        for sentence in process_text_into_sentences(text):
	            # Only the first record carrying a given sentence text is kept.
	            if sentence in seen_sentences:
	                continue
	            seen_sentences.add(sentence)
	            unique_records.append({
	                "company": source_record.get("company"),
	                "quarter": source_record.get("quarter"),
	                "source": source_record.get("source"),
	                "sentence": sentence,
	            })

	    return unique_records