Nick021402 committed on
Commit
b95442f
·
verified ·
1 Parent(s): 9f8d849

Update segmenter.py

Browse files
Files changed (1) hide show
  1. segmenter.py +11 -16
segmenter.py CHANGED
@@ -7,7 +7,8 @@ logger = logging.getLogger(__name__)
7
 
8
  class TextSegmenter:
9
  def __init__(self):
10
- self.speakers = ["speaker1", "speaker2", "speaker3", "speaker4"]
 
11
  self.current_speaker_index = 0
12
 
13
  def segment_and_assign_speakers(
@@ -45,11 +46,11 @@ class TextSegmenter:
45
 
46
  def _segment_by_dialogue(self, text: str) -> List[Tuple[str, str]]:
47
  """Segment by detecting dialogue patterns."""
48
- # Look for dialogue markers like quotes, dashes, etc.
49
  lines = text.split('\n')
50
  segments = []
51
  current_segment = []
52
- current_speaker = self.speakers[0]
 
53
 
54
  for line in lines:
55
  line = line.strip()
@@ -79,55 +80,49 @@ class TextSegmenter:
79
 
80
  def _segment_auto(self, text: str) -> List[Tuple[str, str]]:
81
  """Automatic segmentation using multiple heuristics."""
82
- # Try to detect natural breaks
83
  segments = []
84
 
85
- # Split by double newlines first (paragraphs)
86
  paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
87
 
88
  if len(paragraphs) > 1:
89
- # Use paragraph-based segmentation
90
  return self._segment_by_paragraphs(text)
91
 
92
- # Fall back to sentence-based segmentation for long text
93
  sentences = self._split_into_sentences(text)
94
  if len(sentences) > 10:
95
  return self._segment_by_sentence_groups(sentences)
96
 
97
- # For short text, just alternate every few sentences
98
  return self._segment_simple(text)
99
 
100
  def _split_into_sentences(self, text: str) -> List[str]:
101
  """Split text into sentences."""
102
  # Simple sentence splitting
103
- sentences = re.split(r'[.!?]+', text)
 
 
104
  return [s.strip() for s in sentences if s.strip()]
105
 
106
  def _segment_by_sentence_groups(self, sentences: List[str]) -> List[Tuple[str, str]]:
107
  """Group sentences and assign to different speakers."""
108
  segments = []
109
- group_size = max(2, len(sentences) // 8) # Aim for reasonable segment sizes
110
 
111
  for i in range(0, len(sentences), group_size):
112
  group = sentences[i:i + group_size]
113
  speaker = self.speakers[i // group_size % len(self.speakers)]
114
- text_segment = '. '.join(group) + '.'
115
  segments.append((speaker, text_segment))
116
 
117
  return segments
118
 
119
  def _segment_simple(self, text: str) -> List[Tuple[str, str]]:
120
  """Simple segmentation for short texts."""
121
- # Just split roughly in half or thirds
122
  words = text.split()
123
  total_words = len(words)
124
 
125
  if total_words < 50:
126
- # Too short to split meaningfully
127
- return [("speaker1", text)]
128
 
129
- # Split into 2-3 segments
130
- num_segments = min(3, max(2, total_words // 100))
131
  segment_size = total_words // num_segments
132
 
133
  segments = []
 
7
 
8
  class TextSegmenter:
9
  def __init__(self):
10
+ # Changed speakers to Nari DIA's expected tags
11
+ self.speakers = ["S1", "S2"]
12
  self.current_speaker_index = 0
13
 
14
  def segment_and_assign_speakers(
 
46
 
47
  def _segment_by_dialogue(self, text: str) -> List[Tuple[str, str]]:
48
  """Segment by detecting dialogue patterns."""
 
49
  lines = text.split('\n')
50
  segments = []
51
  current_segment = []
52
+ # Start with the first speaker in the list
53
+ current_speaker = self.speakers[0]
54
 
55
  for line in lines:
56
  line = line.strip()
 
80
 
81
  def _segment_auto(self, text: str) -> List[Tuple[str, str]]:
82
  """Automatic segmentation using multiple heuristics."""
 
83
  segments = []
84
 
 
85
  paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
86
 
87
  if len(paragraphs) > 1:
 
88
  return self._segment_by_paragraphs(text)
89
 
 
90
  sentences = self._split_into_sentences(text)
91
  if len(sentences) > 10:
92
  return self._segment_by_sentence_groups(sentences)
93
 
 
94
  return self._segment_simple(text)
95
 
96
  def _split_into_sentences(self, text: str) -> List[str]:
97
  """Split text into sentences."""
98
  # Simple sentence splitting
99
+ # Split on whitespace that follows '.', '!' or '?' so the punctuation stays attached to its sentence (note: abbreviations such as "Mr." followed by a space are still split)
100
+ # This is a common simple improvement, though full NLP libraries are best for complex cases.
101
+ sentences = re.split(r'(?<=[.!?])\s+', text) # Split after . ! ? followed by space
102
  return [s.strip() for s in sentences if s.strip()]
103
 
104
  def _segment_by_sentence_groups(self, sentences: List[str]) -> List[Tuple[str, str]]:
105
  """Group sentences and assign to different speakers."""
106
  segments = []
107
+ group_size = max(2, len(sentences) // 8)
108
 
109
  for i in range(0, len(sentences), group_size):
110
  group = sentences[i:i + group_size]
111
  speaker = self.speakers[i // group_size % len(self.speakers)]
112
+ text_segment = ' '.join(group) # No need to add '.' if already present from sentence splitting
113
  segments.append((speaker, text_segment))
114
 
115
  return segments
116
 
117
  def _segment_simple(self, text: str) -> List[Tuple[str, str]]:
118
  """Simple segmentation for short texts."""
 
119
  words = text.split()
120
  total_words = len(words)
121
 
122
  if total_words < 50:
123
+ return [(self.speakers[0], text)] # Assign to S1
 
124
 
125
+ num_segments = min(len(self.speakers), max(2, total_words // 100)) # Limit segments by available speakers
 
126
  segment_size = total_words // num_segments
127
 
128
  segments = []