sofzcc committed on
Commit
df86717
·
verified ·
1 Parent(s): 3f651bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -3
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import glob
3
  import yaml
@@ -102,7 +103,45 @@ def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
102
  start += chunk_size - overlap
103
 
104
  return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
 
 
 
106
 
107
  def load_file_text(path: str) -> str:
108
  """Load text from various file formats with error handling"""
@@ -342,6 +381,8 @@ class RAGIndex:
342
  **inputs,
343
  max_new_tokens=128,
344
  do_sample=False,
 
 
345
  )
346
 
347
  answer = self.qa_tokenizer.decode(
@@ -386,6 +427,8 @@ class RAGIndex:
386
 
387
  combined_text = "\n\n".join(combined_context)
388
 
 
 
389
 
390
  # Limit context length to keep it manageable
391
  max_context_chars = 4000
@@ -395,15 +438,19 @@ class RAGIndex:
395
  # Prompt for the generative model
396
  prompt = (
397
  "You are an AI assistant that answers questions using only the provided context.\n"
398
- "- Do NOT copy large passages from the context.\n"
399
- "- Do NOT mention file names or sources in your answer.\n"
 
 
 
400
  "- If the answer cannot be found in the context, reply exactly with: "
401
  "\"I don't know based on the provided documents.\"\n\n"
402
  f"Context:\n{combined_text}\n\n"
403
  f"Question: {question}\n\n"
404
- "Answer in 1–3 concise sentences:"
405
  )
406
 
 
407
  try:
408
  answer_text = self._generate_from_context(prompt)
409
  except Exception as e:
 
1
+ import re
2
  import os
3
  import glob
4
  import yaml
 
103
  start += chunk_size - overlap
104
 
105
  return chunks
106
+
107
+
108
def clean_context_text(text: str) -> str:
    """
    Clean raw document context before sending to the generator:
    - Remove markdown headings (#, ##, ###)
    - Remove list markers (1., 2), -, *)
    - Remove duplicate lines
    """
    seen = set()
    cleaned = []

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Strip markdown heading prefixes such as "# 1. Title" or "## Section".
        stripped = re.sub(r"^#+\s*", "", stripped)

        # Strip ordered-list prefixes such as "1. " or "2) ".
        stripped = re.sub(r"^\d+[\.\)]\s*", "", stripped)

        # Strip bullet markers such as "- " or "* ".
        stripped = re.sub(r"^[-*]\s*", "", stripped)

        # Drop very short "noise" lines and exact duplicates of earlier lines.
        if len(stripped) < 5 or stripped in seen:
            continue

        seen.add(stripped)
        cleaned.append(stripped)

    return "\n".join(cleaned)
145
 
146
  def load_file_text(path: str) -> str:
147
  """Load text from various file formats with error handling"""
 
381
  **inputs,
382
  max_new_tokens=128,
383
  do_sample=False,
384
+ top_p=0.9,
385
+ temperature=0.7,
386
  )
387
 
388
  answer = self.qa_tokenizer.decode(
 
427
 
428
  combined_text = "\n\n".join(combined_context)
429
 
430
+ # Clean markdown / numbering / duplicates
431
+ combined_text = clean_context_text(combined_text)
432
 
433
  # Limit context length to keep it manageable
434
  max_context_chars = 4000
 
438
  # Prompt for the generative model
439
  prompt = (
440
  "You are an AI assistant that answers questions using only the provided context.\n"
441
+ "Your task is to synthesize a clear, natural explanation in your own words.\n"
442
+ "- Do NOT copy headings or section numbers from the context.\n"
443
+ "- Do NOT include markdown like '#', '##', '---', or bullet/list markers.\n"
444
+ "- Do NOT mention file names, sources, or internal labels in your answer.\n"
445
+ "- Do NOT just repeat full sentences from the context; always paraphrase.\n"
446
  "- If the answer cannot be found in the context, reply exactly with: "
447
  "\"I don't know based on the provided documents.\"\n\n"
448
  f"Context:\n{combined_text}\n\n"
449
  f"Question: {question}\n\n"
450
+ "Answer in 1–3 concise sentences of plain text:"
451
  )
452
 
453
+
454
  try:
455
  answer_text = self._generate_from_context(prompt)
456
  except Exception as e: