Subha95 committed on
Commit
1ae31f5
·
verified ·
1 Parent(s): 0c9a8f6

Update chatbot_rag.py

Browse files
Files changed (1) hide show
  1. chatbot_rag.py +17 -10
chatbot_rag.py CHANGED
@@ -88,25 +88,32 @@ def build_qa():
88
 
89
 
90
  def hf_to_str(x):
91
- """Convert Hugging Face pipeline output to plain string (clean + generalized)."""
92
  if isinstance(x, list) and "generated_text" in x[0]:
93
  text = x[0]["generated_text"]
94
  else:
95
  text = str(x)
96
 
97
- # 1. Remove markdown/code artifacts
98
- text = text.replace("```", "").replace("#", "").strip()
 
 
99
 
100
- # 2. Normalize whitespace & line breaks
101
- text = re.sub(r"\s+", " ", text)
102
 
103
- # 3. Remove duplicated consecutive phrases (up to ~5 words repeated)
104
- text = re.sub(r"\b(\w+\s+){1,5}(\1){2,}", r"\1", text)
105
 
106
- # 4. Trim leading/trailing junk
107
- text = text.strip(" .,-\n\t")
 
 
 
 
108
 
109
- return text
 
110
 
111
 
112
  # 7. RAG chain
 
88
 
89
 
90
  def hf_to_str(x):
91
+ """Convert Hugging Face pipeline output to clean plain text."""
92
  if isinstance(x, list) and "generated_text" in x[0]:
93
  text = x[0]["generated_text"]
94
  else:
95
  text = str(x)
96
 
97
+ # 1. Remove code-like artifacts
98
+ text = re.sub(r"def\s+\w+\(.*?\):.*", "", text, flags=re.DOTALL)
99
+ text = re.sub(r"(print\(.*?\))", "", text)
100
+ text = re.sub(r"text\s*\+=.*", "", text)
101
 
102
+ # 2. Remove markdown/code fences
103
+ text = text.replace("```", "").replace("'''", "").replace('"""', "")
104
 
105
+ # 3. Normalize whitespace & line breaks
106
+ text = re.sub(r"\s+", " ", text)
107
 
108
+ # 4. Remove repeated sentences (simple dedupe)
109
+ sentences = []
110
+ for s in re.split(r"(?<=[.!?])\s+", text):
111
+ if s not in sentences:
112
+ sentences.append(s)
113
+ text = " ".join(sentences)
114
 
115
+ # 5. Trim
116
+ return text.strip()
117
 
118
 
119
  # 7. RAG chain