Nick021402 committed on
Commit
7456b32
·
verified ·
1 Parent(s): 27a3d23

Create segmenter.py

Browse files
Files changed (1) hide show
  1. segmenter.py +144 -0
segmenter.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # segmenter.py - Text segmentation and speaker assignment
2
+ import re
3
+ from typing import List, Tuple
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
class TextSegmenter:
    """Split text into segments and label each segment with a speaker.

    Speakers are taken round-robin from ``self.speakers``. Three strategies
    are available through :meth:`segment_and_assign_speakers`: paragraph
    based, dialogue-marker based, and an automatic mode that picks a
    strategy from the shape of the input.
    """

    # Line prefixes that start a new dialogue turn.
    _DIALOGUE_MARKERS = ('"', "'", '-', '—')
    # Characters that end a sentence.
    _SENTENCE_TERMINATORS = ('.', '!', '?')
    # A paragraph break is a blank line; the blank line may contain
    # whitespace or a carriage return (Windows line endings).
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    # A run of non-terminator characters followed by its terminator(s).
    _SENTENCE = re.compile(r'([^.!?]+)([.!?]*)')

    def __init__(self):
        self.speakers = ["speaker1", "speaker2", "speaker3", "speaker4"]
        # Index into ``self.speakers`` used by dialogue segmentation; it is
        # reset at the start of every dialogue pass.
        self.current_speaker_index = 0

    def segment_and_assign_speakers(
        self,
        text: str,
        mode: str = "auto"
    ) -> List[Tuple[str, str]]:
        """
        Segment text and assign speakers.

        Args:
            text: Input text to segment
            mode: Segmentation mode ("auto", "paragraph", "dialogue").
                Any other value is treated as "auto".

        Returns:
            List of (speaker, text) tuples; empty when the text contains
            nothing but whitespace.
        """
        if mode == "paragraph":
            return self._segment_by_paragraphs(text)
        if mode == "dialogue":
            return self._segment_by_dialogue(text)
        return self._segment_auto(text)

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Return the non-empty, stripped paragraphs of ``text``."""
        pieces = (p.strip() for p in self._PARAGRAPH_BREAK.split(text))
        return [p for p in pieces if p]

    def _assign_round_robin(self, chunks: List[str]) -> List[Tuple[str, str]]:
        """Pair each chunk with a speaker, cycling through ``self.speakers``."""
        count = len(self.speakers)
        return [(self.speakers[i % count], chunk) for i, chunk in enumerate(chunks)]

    def _segment_by_paragraphs(self, text: str) -> List[Tuple[str, str]]:
        """Segment by paragraphs, cycling through the speakers."""
        return self._assign_round_robin(self._split_into_paragraphs(text))

    def _segment_by_dialogue(self, text: str) -> List[Tuple[str, str]]:
        """Segment by detecting dialogue patterns.

        A line that starts with a quote or a dash opens a new segment and
        hands over to the next speaker; any other non-blank line is appended
        to the current segment. The first segment always belongs to the
        first speaker, and the rotation restarts on every call so that the
        same text always yields the same labels.
        """
        segments = []
        pending = []
        # Restart the rotation: the result must not depend on earlier calls.
        self.current_speaker_index = 0

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(self._DIALOGUE_MARKERS):
                # Only rotate once something has been emitted, otherwise a
                # text opening with dialogue would skip the first speaker.
                if pending:
                    segments.append(
                        (self.speakers[self.current_speaker_index], ' '.join(pending))
                    )
                    self.current_speaker_index = (
                        (self.current_speaker_index + 1) % len(self.speakers)
                    )
                pending = [line]
            else:
                pending.append(line)

        # Flush the trailing segment.
        if pending:
            segments.append(
                (self.speakers[self.current_speaker_index], ' '.join(pending))
            )

        return segments

    def _segment_auto(self, text: str) -> List[Tuple[str, str]]:
        """Automatic segmentation using multiple heuristics.

        Multi-paragraph text is segmented by paragraph; a single paragraph
        with more than 10 sentences is segmented by sentence groups; anything
        else goes through the simple word-count split.
        """
        paragraphs = self._split_into_paragraphs(text)
        if len(paragraphs) > 1:
            return self._assign_round_robin(paragraphs)

        # Fall back to sentence-based segmentation for long text.
        sentences = self._split_into_sentences(text)
        if len(sentences) > 10:
            return self._segment_by_sentence_groups(sentences)

        return self._segment_simple(text)

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences, keeping each sentence's terminator.

        Sentences are delimited by runs of '.', '!' or '?'. The terminator
        run stays attached to its sentence so that questions and
        exclamations survive re-joining. Pieces without any content other
        than whitespace are dropped.
        """
        sentences = []
        for match in self._SENTENCE.finditer(text):
            body = match.group(1).strip()
            if body:
                sentences.append(body + match.group(2))
        return sentences

    def _segment_by_sentence_groups(self, sentences: List[str]) -> List[Tuple[str, str]]:
        """Group sentences and assign the groups to speakers round-robin.

        A sentence that does not already end with '.', '!' or '?' gets a
        period appended; existing terminators are left as they are.
        """
        segments = []
        group_size = max(2, len(sentences) // 8)  # Aim for reasonable segment sizes

        for start in range(0, len(sentences), group_size):
            group = sentences[start:start + group_size]
            speaker = self.speakers[start // group_size % len(self.speakers)]
            closed = [
                s if s.endswith(self._SENTENCE_TERMINATORS) else s + '.'
                for s in group
            ]
            segments.append((speaker, ' '.join(closed)))

        return segments

    def _segment_simple(self, text: str) -> List[Tuple[str, str]]:
        """Simple segmentation for short texts.

        Returns no segments for blank text, a single first-speaker segment
        (text unchanged) for fewer than 50 words, and otherwise two or three
        roughly equal word-count slices.
        """
        words = text.split()
        total_words = len(words)

        if not words:
            # Nothing to speak; match the other modes, which return [].
            return []

        if total_words < 50:
            # Too short to split meaningfully
            return [(self.speakers[0], text)]

        # Split into 2-3 segments
        num_segments = min(3, max(2, total_words // 100))
        segment_size = total_words // num_segments

        segments = []
        for i in range(num_segments):
            start_idx = i * segment_size
            # The last slice absorbs the remainder of the integer division.
            if i < num_segments - 1:
                end_idx = start_idx + segment_size
            else:
                end_idx = total_words

            speaker = self.speakers[i % len(self.speakers)]
            segments.append((speaker, ' '.join(words[start_idx:end_idx])))

        return segments