LiamKhoaLe committed on
Commit
a062909
·
1 Parent(s): 88e7ced

Rm convo aggressively with regex-filter

Browse files
Files changed (2) hide show
  1. utils/cloud_llm.py +67 -5
  2. utils/local_llm.py +43 -4
utils/cloud_llm.py CHANGED
@@ -126,20 +126,82 @@ class Paraphraser:
126
  self.gm_hard = None # Disabled - only use easy model
127
  logger.info("Paraphraser initialized: NVIDIA -> GEMINI_EASY (GEMINI_HARD disabled)")
128
 
129
- # Regex-based cleaning resp from quotes
130
  def _clean_resp(self, resp: str) -> str:
131
  if not resp: return resp
132
  txt = resp.strip()
133
- # Remove common boilerplate prefixes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  for pat in [
135
  r"^Here is (a|the) .*?:\s*",
136
  r"^Paraphrased(?: version)?:\s*",
137
  r"^Sure[,.]?\s*",
138
- r"^Okay[,.]?\s*"
 
 
 
 
 
139
  ]:
140
- import re
141
  txt = re.sub(pat, "", txt, flags=re.I)
142
- return txt.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  # ————— Paraphrase —————
145
  def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: str = None) -> str:
 
126
  self.gm_hard = None # Disabled - only use easy model
127
  logger.info("Paraphraser initialized: NVIDIA -> GEMINI_EASY (GEMINI_HARD disabled)")
128
 
129
+ # Enhanced cleaning to remove conversational elements and comments
130
  def _clean_resp(self, resp: str) -> str:
131
  if not resp: return resp
132
  txt = resp.strip()
133
+
134
+ # Remove common conversational prefixes and comments
135
+ prefixes_to_remove = [
136
+ "Here's a rewritten version of",
137
+ "Here is a rewritten version of",
138
+ "Here's the rewritten text:",
139
+ "Here is the rewritten text:",
140
+ "Here's the translation:",
141
+ "Here is the translation:",
142
+ "Here's the enhanced text:",
143
+ "Here is the enhanced text:",
144
+ "Here's the improved text:",
145
+ "Here is the improved text:",
146
+ "Here's the medical context:",
147
+ "Here is the medical context:",
148
+ "Here's the cleaned text:",
149
+ "Here is the cleaned text:",
150
+ "Here's the answer:",
151
+ "Here is the answer:",
152
+ "Here's a paraphrased version:",
153
+ "Here is a paraphrased version:",
154
+ "Paraphrased version:",
155
+ "Paraphrased:",
156
+ "Sure,",
157
+ "Okay,",
158
+ "Certainly,",
159
+ "Of course,",
160
+ "I can help you with that.",
161
+ "I'll help you with that.",
162
+ "Let me help you with that.",
163
+ "I can rewrite that for you.",
164
+ "I'll rewrite that for you.",
165
+ "Let me rewrite that for you.",
166
+ "I can translate that for you.",
167
+ "I'll translate that for you.",
168
+ "Let me translate that for you.",
169
+ ]
170
+
171
+ # Remove prefixes
172
+ for prefix in prefixes_to_remove:
173
+ if txt.lower().startswith(prefix.lower()):
174
+ txt = txt[len(prefix):].strip()
175
+ break
176
+
177
+ # Remove common boilerplate prefixes with regex
178
+ import re
179
  for pat in [
180
  r"^Here is (a|the) .*?:\s*",
181
  r"^Paraphrased(?: version)?:\s*",
182
  r"^Sure[,.]?\s*",
183
+ r"^Okay[,.]?\s*",
184
+ r"^Certainly[,.]?\s*",
185
+ r"^Of course[,.]?\s*",
186
+ r"^I can .*?:\s*",
187
+ r"^I'll .*?:\s*",
188
+ r"^Let me .*?:\s*"
189
  ]:
 
190
  txt = re.sub(pat, "", txt, flags=re.I)
191
+
192
+ # Remove any remaining conversational elements
193
+ lines = txt.split('\n')
194
+ cleaned_lines = []
195
+ for line in lines:
196
+ line = line.strip()
197
+ if line and not any(phrase in line.lower() for phrase in [
198
+ "here's", "here is", "let me", "i can", "i'll", "sure,", "okay,",
199
+ "certainly,", "of course,", "i hope this helps", "hope this helps",
200
+ "does this help", "is this what you", "let me know if"
201
+ ]):
202
+ cleaned_lines.append(line)
203
+
204
+ return '\n'.join(cleaned_lines).strip()
205
 
206
  # ————— Paraphrase —————
207
  def paraphrase(self, text: str, difficulty: str = "easy", custom_prompt: str = None) -> str:
utils/local_llm.py CHANGED
@@ -145,13 +145,40 @@ class MedAlpacaClient:
145
  if not text:
146
  return text
147
 
148
- # Remove common prefixes and Alpaca format artifacts
149
  prefixes_to_remove = [
150
  "Answer:",
151
  "The answer is:",
152
  "Based on the information provided:",
153
  "Here's the answer:",
154
  "Here is the answer:",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  "### Response:",
156
  "Response:",
157
  "Below is an instruction",
@@ -161,7 +188,7 @@ class MedAlpacaClient:
161
 
162
  text = text.strip()
163
  for prefix in prefixes_to_remove:
164
- if text.startswith(prefix):
165
  text = text[len(prefix):].strip()
166
  break
167
 
@@ -170,8 +197,20 @@ class MedAlpacaClient:
170
  text = text.split("### Response:")[-1].strip()
171
  if "### Input:" in text:
172
  text = text.split("### Input:")[0].strip()
173
-
174
- return text
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def _snip(self, text: str, max_words: int = 12) -> str:
177
  """Truncate text for logging"""
 
145
  if not text:
146
  return text
147
 
148
+ # Remove common conversational prefixes and comments
149
  prefixes_to_remove = [
150
  "Answer:",
151
  "The answer is:",
152
  "Based on the information provided:",
153
  "Here's the answer:",
154
  "Here is the answer:",
155
+ "Here's a rewritten version:",
156
+ "Here is a rewritten version:",
157
+ "Here's the rewritten text:",
158
+ "Here is the rewritten text:",
159
+ "Here's the translation:",
160
+ "Here is the translation:",
161
+ "Here's the enhanced text:",
162
+ "Here is the enhanced text:",
163
+ "Here's the improved text:",
164
+ "Here is the improved text:",
165
+ "Here's the medical context:",
166
+ "Here is the medical context:",
167
+ "Here's the cleaned text:",
168
+ "Here is the cleaned text:",
169
+ "Sure,",
170
+ "Okay,",
171
+ "Certainly,",
172
+ "Of course,",
173
+ "I can help you with that.",
174
+ "I'll help you with that.",
175
+ "Let me help you with that.",
176
+ "I can rewrite that for you.",
177
+ "I'll rewrite that for you.",
178
+ "Let me rewrite that for you.",
179
+ "I can translate that for you.",
180
+ "I'll translate that for you.",
181
+ "Let me translate that for you.",
182
  "### Response:",
183
  "Response:",
184
  "Below is an instruction",
 
188
 
189
  text = text.strip()
190
  for prefix in prefixes_to_remove:
191
+ if text.lower().startswith(prefix.lower()):
192
  text = text[len(prefix):].strip()
193
  break
194
 
 
197
  text = text.split("### Response:")[-1].strip()
198
  if "### Input:" in text:
199
  text = text.split("### Input:")[0].strip()
200
+
201
+ # Remove any remaining conversational elements
202
+ lines = text.split('\n')
203
+ cleaned_lines = []
204
+ for line in lines:
205
+ line = line.strip()
206
+ if line and not any(phrase in line.lower() for phrase in [
207
+ "here's", "here is", "let me", "i can", "i'll", "sure,", "okay,",
208
+ "certainly,", "of course,", "i hope this helps", "hope this helps",
209
+ "does this help", "is this what you", "let me know if"
210
+ ]):
211
+ cleaned_lines.append(line)
212
+
213
+ return '\n'.join(cleaned_lines).strip()
214
 
215
  def _snip(self, text: str, max_words: int = 12) -> str:
216
  """Truncate text for logging"""