Spaces:

rishach
/

ai_flashcard_generator

Running

ai_flashcard_generator / src /flashcard_generator /text_processing.py

pranshu dhiman

Initial commit with Docker and Streamlit

46b701f 23 days ago

2.37 kB

	from __future__ import annotations

	import re
	from collections.abc import Iterable


	# Any run of whitespace (spaces, tabs, newlines, ...).
	_WHITESPACE_RE = re.compile(r"\s+")
	# A hyphen that ends a line, together with the whitespace around the break.
	_HYPHEN_LINEBREAK_RE = re.compile(r"-\s*\n\s*")


	def clean_text(text: str) -> str:
	    """Normalize extracted lecture-note text without removing useful punctuation.

	    Steps, in order: NUL characters become spaces; a hyphen at the end of a
	    line is removed together with the line break so the two word halves are
	    re-joined; every run of whitespace (newlines included) collapses to a
	    single space; leading and trailing whitespace is stripped.

	    Args:
	        text: Raw text to normalize.

	    Returns:
	        The normalized single-line string (empty if nothing but whitespace
	        or NUL characters was present).
	    """
	    text = text.replace("\x00", " ")
	    # Whitespace around the break is optional: the usual artefact is
	    # "word-\nword" with nothing between the hyphen and the newline.
	    text = _HYPHEN_LINEBREAK_RE.sub("", text)
	    # Newlines count as whitespace, so this one pass also flattens them.
	    text = _WHITESPACE_RE.sub(" ", text)
	    return text.strip()


	def token_count(text: str) -> int:
	    """Return how many whitespace-separated tokens ``text`` contains."""
	    tokens = text.split()
	    return len(tokens)


	def split_into_chunks(text: str, min_tokens: int = 300, max_tokens: int = 500) -> list[str]:
	    """Split text into chunks of roughly ``min_tokens``-``max_tokens`` words.

	    The text is normalized with ``clean_text`` and cut into sentences wherever
	    whitespace follows ".", "!" or "?". Sentences are packed in order; a
	    "token" is a whitespace-separated word. Both bounds are soft:

	    * A chunk is closed only when it already holds ``min_tokens`` words and
	      the next sentence would push it past ``max_tokens``. A chunk still
	      below ``min_tokens`` keeps absorbing sentences, so it may end up
	      larger than ``max_tokens``.
	    * A single sentence longer than ``max_tokens`` flushes the chunk being
	      built and is cut into pieces of at most ``max_tokens`` words.
	    * A final remainder shorter than ``min_tokens // 2`` is appended to the
	      previous chunk instead of becoming its own chunk.

	    Args:
	        text: Raw text to split.
	        min_tokens: Word count a chunk must reach before it may be closed.
	        max_tokens: Word count that triggers closing a chunk; also the piece
	            size used for over-long sentences.

	    Returns:
	        The chunks in document order; ``[]`` when the text has no content.

	    Raises:
	        ValueError: If ``max_tokens`` is less than 1 and the text is not empty.
	    """
	    cleaned = clean_text(text)
	    if not cleaned:
	        return []

	    # Checked after the empty-text shortcut so empty input keeps returning [].
	    # A negative value would otherwise drop every sentence silently, and zero
	    # would surface as an unrelated range() error.
	    if max_tokens < 1:
	        raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")

	    chunks: list[str] = []
	    current: list[str] = []
	    current_tokens = 0

	    for sentence in re.split(r"(?<=[.!?])\s+", cleaned):
	        words = sentence.split()
	        if not words:
	            continue

	        if len(words) > max_tokens:
	            # Over-long sentence: flush what we have, then hard-split it.
	            if current:
	                chunks.append(" ".join(current))
	                current = []
	                current_tokens = 0
	            chunks.extend(_split_long_sentence(words, max_tokens))
	            continue

	        would_exceed = current_tokens + len(words) > max_tokens
	        can_close = current_tokens >= min_tokens
	        if current and would_exceed and can_close:
	            chunks.append(" ".join(current))
	            current = [sentence]
	            current_tokens = len(words)
	        else:
	            current.append(sentence)
	            current_tokens += len(words)

	    if current:
	        tail = " ".join(current)
	        # current_tokens is already the word count of tail; no need to re-split.
	        if chunks and current_tokens < min_tokens // 2:
	            chunks[-1] = f"{chunks[-1]} {tail}"
	        else:
	            chunks.append(tail)

	    return chunks


	def _split_long_sentence(words: Iterable[str], max_tokens: int) -> list[str]:
	    """Cut a sequence of words into consecutive pieces of at most ``max_tokens`` words.

	    Each piece is returned as a single space-joined string, in order.
	    """
	    tokens = list(words)
	    pieces: list[str] = []
	    for start in range(0, len(tokens), max_tokens):
	        window = tokens[start : start + max_tokens]
	        pieces.append(" ".join(window).strip())
	    return pieces


	def first_sentences(text: str, limit: int = 3) -> str:
	    """Return the leading sentences of ``text`` joined by single spaces.

	    The text is normalized with ``clean_text`` and split wherever whitespace
	    follows ".", "!" or "?". Empty pieces are discarded, the remaining list
	    is sliced with ``[:limit]``, and the selection is joined back together.
	    """
	    normalized = clean_text(text)
	    pieces = re.split(r"(?<=[.!?])\s+", normalized)
	    non_empty = list(filter(None, pieces))
	    return " ".join(non_empty[:limit]).strip()