File size: 2,070 Bytes
54bef2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# app/services/processing.py
import re

def clean_text(text: str) -> str:
    """
    Clean transcript text for downstream processing.

    Strips, in order:
    - Timestamp markers such as {ts:123}
    - Bracketed sound-effect markers such as [संगीत], [Music], [Applause]
    - Parenthesized metadata such as (music), (laughing)
    - URLs
    then normalizes all whitespace (including newlines) to single spaces.

    Args:
        text: Raw transcript text; falsy input yields "".

    Returns:
        The cleaned, whitespace-normalized text.
    """
    if not text:
        return ""

    # Ordered removal passes — each pattern is erased outright.
    removal_patterns = (
        r'\{ts:\d+\}',      # timestamp markers: {ts:123}, {ts:0}, ...
        r'\[.*?\]',         # sound-effect markers: [Music], [Applause], ...
        r'\(.*?\)',         # parenthesized metadata: (music), (laughing), ...
        r'http[s]?://\S+',  # URLs
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Newlines become spaces, then runs of whitespace collapse to one space.
    cleaned = re.sub(r'\s+', ' ', cleaned.replace('\n', ' '))

    return cleaned.strip()

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """
    Split text into word-based chunks with overlap for better context preservation.

    Args:
        text: Cleaned text to chunk.
        chunk_size: Number of words per chunk (default: 500). Must be > 0.
        overlap: Number of overlapping words between consecutive chunks
            (default: 50). Must satisfy 0 <= overlap < chunk_size.

    Returns:
        List of text chunks; each chunk shares `overlap` words with the
        previous one. Empty input yields an empty list.

    Raises:
        ValueError: If chunk_size <= 0, or overlap is negative or >= chunk_size.
    """
    # Validate parameters up front: with overlap >= chunk_size the start
    # position would never advance and the original loop ran forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if not text:
        return []

    words = text.split()

    # Text that fits in a single chunk is returned verbatim.
    if len(words) <= chunk_size:
        return [text]

    step = chunk_size - overlap  # guaranteed > 0 by the validation above
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the final word, so the trailing overlap
        # region is not emitted again as a tiny extra chunk.
        if start + chunk_size >= len(words):
            break

    return chunks