Spaces:

devjhawar
/

KLypse

Sleeping

DEVJHAWAR11

Deploy Klypse backend

54bef2f 7 months ago

2.07 kB

	# app/services/processing.py
	import re

	def clean_text(text: str) -> str:
	"""
	Clean transcript text by removing:
	- Timestamp markers like {ts:123}
	- Extra whitespace, line breaks
	- Special characters and formatting artifacts
	- Music/sound effect markers like [संगीत], [Music]
	"""
	if not text:
	return ""

	# Remove timestamp markers: {ts:123}, {ts:0}, etc.
	text = re.sub(r'\{ts:\d+\}', '', text)

	# Remove sound effect markers: [संगीत], [Music], [Applause], etc.
	text = re.sub(r'\[.*?\]', '', text)

	# Remove parentheses with metadata: (music), (laughing), etc.
	text = re.sub(r'\(.*?\)', '', text)

	# Remove URLs
	text = re.sub(r'http[s]?://\S+', '', text)

	# Replace multiple line breaks with space
	text = text.replace('\n', ' ')

	# Remove extra whitespace (multiple spaces to single space)
	text = re.sub(r'\s+', ' ', text)

	# Remove leading/trailing whitespace
	text = text.strip()

	return text

	def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
	"""
	Split text into chunks with overlap for better context preservation.

	Args:
	text: Cleaned text to chunk
	chunk_size: Number of words per chunk (default: 500)
	overlap: Number of overlapping words between chunks (default: 50)

	Returns:
	List of text chunks with overlap
	"""
	if not text:
	return []

	words = text.split()

	# If text is smaller than chunk_size, return as single chunk
	if len(words) <= chunk_size:
	return [text]

	chunks = []
	start = 0

	while start < len(words):
	# Get chunk of words
	end = start + chunk_size
	chunk = " ".join(words[start:end])
	chunks.append(chunk)

	# Move start position with overlap
	start = end - overlap

	# Prevent infinite loop if we're at the end
	if end >= len(words):
	break

	return chunks