cicboy committed on
Commit
f1e22c1
·
1 Parent(s): a370528

update hybrid_retriever_tool file and gradio UI

Browse files
Files changed (2) hide show
  1. app.py +31 -12
  2. tools/hybrid_retriever_tool.py +42 -10
app.py CHANGED
@@ -166,19 +166,38 @@ def generate_blog(topic, tone):
166
  yield final_text
167
 
168
  # Build Gradio Interface
169
- with gr.Blocks(css="""
170
  #output-box {
171
- background-color: #f8f9fa;
172
- border-radius: 12px;
173
- padding: 1.5rem;
174
- font-family: 'Inter', sans-serif;
175
- font-size: 1rem;
176
- line-height: 1.6;
177
- white-space: pre-wrap;
178
- overflow-y: auto;
179
- max-height: 70vh;
180
- }
181
- """) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  gr.Markdown(
183
  """
184
  ## ✍️ AI Blog Writer Multi-Agent
 
166
  yield final_text
167
 
168
  # Build Gradio Interface
169
+ css = """
170
  #output-box {
171
+ background-color: #f8f9fa;
172
+ border-radius: 12px;
173
+ padding: 1.5rem;
174
+ font-family: 'Inter', sans-serif;
175
+ font-size: 1rem;
176
+ line-height: 1.6;
177
+ white-space: pre-wrap;
178
+ overflow-y: auto;
179
+ max-height: 70vh;
180
+ }
181
+
182
+ #context-box h1, #context-box h2, #context-box h3 {
183
+ font-size: 1rem !important;
184
+ font-weight: 600 !important;
185
+ }
186
+ #context-box {
187
+ font-family: 'Inter', sans-serif;
188
+ font-size: 1rem;
189
+ line-height: 1.6;
190
+ background-color: #ffffff;
191
+ border: 1px solid #ddd;
192
+ border-radius: 10px;
193
+ padding: 1rem;
194
+ margin-top: 0.5rem;
195
+ max-height: 70vh;
196
+ overflow-y: auto;
197
+ }
198
+ """
199
+
200
+ with gr.Blocks(css=css) as demo:
201
  gr.Markdown(
202
  """
203
  ## ✍️ AI Blog Writer Multi-Agent
tools/hybrid_retriever_tool.py CHANGED
@@ -27,19 +27,51 @@ class HybridRetrieverTool(RagTool):
27
 
28
  # 🧹 Text Cleaning
29
  def _clean_text(self, text: str):
30
- """Remove HTML, images, boilerplate; keep valuable text & extract URLs for citation."""
 
 
 
 
 
 
 
31
  urls = re.findall(r'https?://\S+', text)
 
 
32
  text = unescape(text)
33
- text = re.sub(r"<[^>]+>", " ", text) # Remove HTML tags
34
- text = re.sub(r"!\[.*?\]\(.*?\)", " ", text) # Remove Markdown images
35
- text = re.sub(r"\[.*?\]\(.*?\)", " ", text) # Remove Markdown links
36
- text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.IGNORECASE)
37
- text = re.sub(r"http\S+", " ", text) # Remove URLs inline
38
- text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More)+", " ", text, flags=re.IGNORECASE)
39
- text = re.sub(r"\s+", " ", text).strip() # Normalize spaces
40
- text = re.sub(r"(Education Weekly Update.*?)+", "", text, flags=re.IGNORECASE)
41
- if len(text.split()) < 10:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  return None, []
 
 
 
 
43
  return text, urls
44
 
45
  def _build_corpus(self, topic: str):
 
27
 
28
  # 🧹 Text Cleaning
29
  def _clean_text(self, text: str):
30
+ """
31
+ Clean Tavily content by removing HTML, bullets, boilerplate, and repetitive junk
32
+ while preserving high-value plain text and extracting source URLs for citation.
33
+ """
34
+ if not text or len(text.strip()) < 10:
35
+ return None, []
36
+
37
+ # Extract URLs for citation before cleaning
38
  urls = re.findall(r'https?://\S+', text)
39
+
40
+ # Decode HTML entities and remove tags
41
  text = unescape(text)
42
+ text = re.sub(r"<[^>]+>", " ", text) # strip HTML tags
43
+ text = re.sub(r"!\[.*?\]\(.*?\)", " ", text) # remove Markdown images
44
+ text = re.sub(r"\[.*?\]\(.*?\)", " ", text) # remove Markdown links
45
+ text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.I)
46
+ text = re.sub(r"http\S+", " ", text) # remove URLs inline
47
+
48
+ # Remove layout and boilerplate junk
49
+ text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More|Click Here)+", " ", text, flags=re.I)
50
+ text = re.sub(r"(Education Weekly Update.*?)+", " ", text, flags=re.I)
51
+ text = re.sub(r"(\bAI\s*\+\s*){2,}", "AI ", text) # collapse 'AI + AI + AI'
52
+ text = re.sub(r"[•·●○◦‣⁃∙▪]+", " ", text) # remove bullet symbols
53
+ text = re.sub(r"(?m)^\s*#.*$", " ", text) # remove markdown headers
54
+ text = re.sub(r"\b[A-Z]{2,}\b( [A-Z]{2,}\b)+", " ", text) # collapse ALLCAPS runs
55
+ text = text.replace("\xa0", " ") # remove non-breaking spaces
56
+ text = re.sub(r"\s{2,}", " ", text).strip() # normalize whitespace
57
+
58
+ # Filter out boilerplate / short junk sections
59
+ if any(kw in text.lower() for kw in [
60
+ "education weekly update",
61
+ "copyright",
62
+ "terms of use",
63
+ "cookie policy",
64
+ "advertisement",
65
+ "site map",
66
+ ]):
67
+ return None, []
68
+
69
+ if len(text.split()) < 30:
70
  return None, []
71
+
72
+ # Normalize casing (optional but improves readability)
73
+ text = text[0].upper() + text[1:] if len(text) > 1 else text
74
+
75
  return text, urls
76
 
77
  def _build_corpus(self, topic: str):