| |
| import re |
|
|
def clean_text(text: str) -> str:
    """
    Strip transcript noise from ``text`` and normalise its whitespace.

    The following are removed, in this order:
    - Timestamp markers such as {ts:123}
    - Square-bracketed markers such as [संगीत] or [Music]
    - Parenthesised asides such as (laughs)
    - http:// and https:// URLs

    Bracketed and parenthesised spans are only matched within a single
    line; a marker split across a line break is left in place.

    Afterwards every run of whitespace (including line breaks) is collapsed
    to one space and the result is trimmed at both ends.

    Args:
        text: Raw transcript text. Any falsy value yields "".

    Returns:
        The cleaned, single-line text.
    """
    if not text:
        return ""

    # Order matters: bracket/parenthesis stripping runs before URL removal,
    # so a URL containing "(...)" loses that part first.
    noise_patterns = (
        r'\{ts:\d+\}',
        r'\[.*?\]',
        r'\(.*?\)',
        r'http[s]?://\S+',
    )

    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Flatten line breaks, then squeeze whitespace runs down to one space.
    cleaned = cleaned.replace('\n', ' ')
    return re.sub(r'\s+', ' ', cleaned).strip()
|
|
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """
    Split text into word-based chunks that overlap for context preservation.

    Words are obtained by splitting on whitespace. Each chunk holds up to
    ``chunk_size`` words, and each chunk after the first starts
    ``chunk_size - overlap`` words after the previous one, so consecutive
    chunks share ``overlap`` words. The last chunk may be shorter.

    Args:
        text: Cleaned text to chunk
        chunk_size: Number of words per chunk (default: 500); must be > 0
        overlap: Number of overlapping words between chunks (default: 50);
            must satisfy 0 <= overlap < chunk_size

    Returns:
        List of text chunks with overlap. Empty or whitespace-only text
        yields []. Text with at most ``chunk_size`` words is returned
        unchanged as a single chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Validate up front: a non-positive stride (overlap >= chunk_size or
    # chunk_size <= 0) would never advance the window and loop forever,
    # and a negative overlap would silently skip words between chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    if not text:
        return []

    words = text.split()

    # Whitespace-only input contains no words, so there is nothing to chunk.
    if not words:
        return []

    # Short input fits in one chunk; hand it back untouched.
    if len(words) <= chunk_size:
        return [text]

    total = len(words)
    stride = chunk_size - overlap  # guaranteed >= 1 by the validation above

    chunks = []
    for start in range(0, total, stride):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Stop once a chunk has reached the final word; continuing would
        # only emit a tail that the current chunk already contains.
        if end >= total:
            break

    return chunks
|
|