cicboy committed on
Commit
f1e22c1
·
1 Parent(s): a370528

update hybrid_retriever_tool file and gradio UI

Browse files
Files changed (2) hide show
  1. app.py +31 -12
  2. tools/hybrid_retriever_tool.py +42 -10
app.py CHANGED
@@ -166,19 +166,38 @@ def generate_blog(topic, tone):
166
  yield final_text
167
 
168
  # Build Gradio Interface
169
- with gr.Blocks(css="""
170
  #output-box {
171
- background-color: #f8f9fa;
172
- border-radius: 12px;
173
- padding: 1.5rem;
174
- font-family: 'Inter', sans-serif;
175
- font-size: 1rem;
176
- line-height: 1.6;
177
- white-space: pre-wrap;
178
- overflow-y: auto;
179
- max-height: 70vh;
180
- }
181
- """) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  gr.Markdown(
183
  """
184
  ## ✍️ AI Blog Writer Multi-Agent
 
166
  yield final_text
167
 
168
  # Build Gradio Interface
169
+ css = """
170
  #output-box {
171
+ background-color: #f8f9fa;
172
+ border-radius: 12px;
173
+ padding: 1.5rem;
174
+ font-family: 'Inter', sans-serif;
175
+ font-size: 1rem;
176
+ line-height: 1.6;
177
+ white-space: pre-wrap;
178
+ overflow-y: auto;
179
+ max-height: 70vh;
180
+ }
181
+
182
+ #context-box h1, #context-box h2, #context-box h3 {
183
+ font-size: 1rem !important;
184
+ font-weight: 600 !important;
185
+ }
186
+ #context-box {
187
+ font-family: 'Inter', sans-serif;
188
+ font-size: 1rem;
189
+ line-height: 1.6;
190
+ background-color: #ffffff;
191
+ border: 1px solid #ddd;
192
+ border-radius: 10px;
193
+ padding: 1rem;
194
+ margin-top: 0.5rem;
195
+ max-height: 70vh;
196
+ overflow-y: auto;
197
+ }
198
+ """
199
+
200
+ with gr.Blocks(css=css) as demo:
201
  gr.Markdown(
202
  """
203
  ## ✍️ AI Blog Writer Multi-Agent
tools/hybrid_retriever_tool.py CHANGED
@@ -27,19 +27,51 @@ class HybridRetrieverTool(RagTool):
27
 
28
  # 🧹 Text Cleaning
29
  def _clean_text(self, text: str):
30
- """Remove HTML, images, boilerplate; keep valuable text & extract URLs for citation."""
 
 
 
 
 
 
 
31
  urls = re.findall(r'https?://\S+', text)
 
 
32
  text = unescape(text)
33
- text = re.sub(r"<[^>]+>", " ", text) # Remove HTML tags
34
- text = re.sub(r"!\[.*?\]\(.*?\)", " ", text) # Remove Markdown images
35
- text = re.sub(r"\[.*?\]\(.*?\)", " ", text) # Remove Markdown links
36
- text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.IGNORECASE)
37
- text = re.sub(r"http\S+", " ", text) # Remove URLs inline
38
- text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More)+", " ", text, flags=re.IGNORECASE)
39
- text = re.sub(r"\s+", " ", text).strip() # Normalize spaces
40
- text = re.sub(r"(Education Weekly Update.*?)+", "", text, flags=re.IGNORECASE)
41
- if len(text.split()) < 10:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  return None, []
 
 
 
 
43
  return text, urls
44
 
45
  def _build_corpus(self, topic: str):
 
27
 
28
  # 🧹 Text Cleaning
29
  def _clean_text(self, text: str):
30
+ """
31
+ Clean Tavily content by removing HTML, bullets, boilerplate, and repetitive junk
32
+ while preserving high-value plain text and extracting source URLs for citation.
33
+ """
34
+ if not text or len(text.strip()) < 10:
35
+ return None, []
36
+
37
+ # Extract URLs for citation before cleaning
38
  urls = re.findall(r'https?://\S+', text)
39
+
40
+ # Decode HTML entities and remove tags
41
  text = unescape(text)
42
+ text = re.sub(r"<[^>]+>", " ", text) # strip HTML tags
43
+ text = re.sub(r"!\[.*?\]\(.*?\)", " ", text) # remove Markdown images
44
+ text = re.sub(r"\[.*?\]\(.*?\)", " ", text) # remove Markdown links
45
+ text = re.sub(r"\S+\.(jpg|jpeg|png|gif|svg|webp|pdf)", " ", text, flags=re.I)
46
+ text = re.sub(r"http\S+", " ", text) # remove URLs inline
47
+
48
+ # Remove layout and boilerplate junk
49
+ text = re.sub(r"(Share|Tweet|Email|Login|Subscribe|Learn More|Read More|Click Here)+", " ", text, flags=re.I)
50
+ text = re.sub(r"(Education Weekly Update.*?)+", " ", text, flags=re.I)
51
+ text = re.sub(r"(\bAI\s*\+\s*){2,}", "AI ", text) # collapse 'AI + AI + AI'
52
+ text = re.sub(r"[•·●○◦‣⁃∙▪]+", " ", text) # remove bullet symbols
53
+ text = re.sub(r"(?m)^\s*#.*$", " ", text) # remove markdown headers
54
+ text = re.sub(r"\b[A-Z]{2,}\b( [A-Z]{2,}\b)+", " ", text) # collapse ALLCAPS runs
55
+ text = text.replace("\xa0", " ") # remove non-breaking spaces
56
+ text = re.sub(r"\s{2,}", " ", text).strip() # normalize whitespace
57
+
58
+ # Filter out boilerplate / short junk sections
59
+ if any(kw in text.lower() for kw in [
60
+ "education weekly update",
61
+ "copyright",
62
+ "terms of use",
63
+ "cookie policy",
64
+ "advertisement",
65
+ "site map",
66
+ ]):
67
+ return None, []
68
+
69
+ if len(text.split()) < 30:
70
  return None, []
71
+
72
+ # Normalize casing (optional but improves readability)
73
+ text = text[0].upper() + text[1:] if len(text) > 1 else text
74
+
75
  return text, urls
76
 
77
  def _build_corpus(self, topic: str):