Add hybrid_retriever_tool for RAG and update application
- app.py +22 -1
- tools/hybrid_retriever_tool.py +70 -0
app.py
CHANGED
@@ -6,6 +6,7 @@ import os
 from dotenv import load_dotenv
 from pathlib import Path
 import gradio as gr
+from tools.hybrid_retriever_tool import HybridRetrieverTool
 
 # control warnings
 warnings.filterwarnings("ignore")
@@ -19,16 +20,21 @@ llm_writer = LLM(model="gpt-5-mini", temperature=1.0)
 llm_editor = LLM(model="gpt-4-turbo", temperature=0.3)
 llm_fact = LLM(model="gpt-4o-mini", temperature=0.3)
 
+# Define tools
+hybrid_tool = HybridRetrieverTool(alpha=0.6)  # hybrid BM25 + semantic retrieval for RAG
+
 # Creating Agents
 
 planner = Agent(
     role="Content Planner",
     goal="Plan engaging and factually accurate content on {topic}",
     backstory="You are working on planning a blog article about the topic {topic}. "
+        "Use the retriever tool to gather accurate, recent information before outlining. "  # RAG search
         "You collect relevant information that helps the audience learn something and make informed decisions. "
        "Your work is the basis for the Content Writer to write an article on this topic.",
     allow_delegation=False,
     verbose=True,
+    tools=[hybrid_tool],  # RAG search
     llm=llm_planner
 )
 
@@ -48,9 +54,11 @@ writer = Agent(
 fact_checker = Agent(
     role="Fact Checker",
     goal="Verify factual accuracy, detect unsupported claims, and identify missing references or sources.",
-    backstory="You are a meticulous research analyst who checks every claim against known facts and relaible sources",
+    backstory="You are a meticulous research analyst who checks every claim against known facts and reliable sources. "
+        "Use the retriever tool to cross-check the Content Writer's statements against reliable, recent information.",
     allow_delegation=False,
     verbose=True,
+    tools=[hybrid_tool],  # RAG search
     llm=llm_fact
 )
 
@@ -133,6 +141,13 @@ crew = Crew(
     verbose=True
 )
 
+# Fetch and summarize retrieved context (RAG search)
+def fetch_context(topic):
+    passages = hybrid_tool._run(topic)  # _run always returns a string
+    if passages == "No relevant content found.":
+        return passages  # nothing retrieved, so nothing to summarize
+    return hybrid_tool.summarize_passages(topic, passages)
+
 # Define Gradio handler
 def generate_blog(topic, tone):
     yield "⏳ Generating blog - this may take a few moments..."
@@ -180,10 +197,14 @@ with gr.Blocks(css="""
         label="Select Writing Tone",
         value="academic"
     )
+    fetch_btn = gr.Button("🔍 Fetch & Summarize Context", variant="secondary")  # RAG search
+    context_output = gr.Markdown(label="📚 Retrieved Context Summary")  # RAG search
+
 
     run_button = gr.Button("🚀 Generate Blog", variant="primary")
     output = gr.Textbox(label="📰 Generated Blog Post", elem_id="output-box", lines=25, interactive=False, show_label=False)
 
+    fetch_btn.click(fetch_context, inputs=[topic], outputs=[context_output])  # RAG search
     run_button.click(generate_blog, inputs=[topic, tone], outputs=[output])
 
 # Launch app
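Note: CrewAI validates the objects passed to an agent's `tools=` list, and a plain Python class with a `_run` method may not pass that check in every CrewAI version. A minimal wrapper sketch, assuming CrewAI's `BaseTool` interface (the import path, the `name`/`description` fields, and the `HybridSearchTool` name are assumptions, not part of this commit):

from crewai.tools import BaseTool  # assumed import path; some releases expose BaseTool from crewai_tools
from tools.hybrid_retriever_tool import HybridRetrieverTool

_retriever = HybridRetrieverTool(alpha=0.6)  # module-level instance so the embedder loads once

class HybridSearchTool(BaseTool):
    name: str = "hybrid_search"  # hypothetical tool name
    description: str = "Hybrid BM25 + semantic web search returning recent, relevant passages."

    def _run(self, query: str) -> str:
        # Delegate to the retriever; returns joined top passages or a fallback message.
        return _retriever._run(query)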
tools/hybrid_retriever_tool.py
ADDED
@@ -0,0 +1,70 @@
+import numpy as np
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer
+from tavily import TavilyClient
+from openai import OpenAI
+import os
+
+class HybridRetrieverTool:
+    """
+    Dynamically builds a hybrid BM25 + semantic retriever from live Tavily results.
+    """
+
+    def __init__(self, alpha=0.6):
+        self.alpha = alpha  # weight on semantic scores in the fusion
+        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
+        self.tavily = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    def _build_corpus(self, topic):
+        """Fetch up-to-date search results."""
+        results = self.tavily.search(query=topic, max_results=30)
+        corpus = []
+        for r in results.get("results", []):
+            content = r.get("content") or ""
+            if len(content.strip()) > 0:
+                corpus.append(content)
+        return corpus
+
+    def _run(self, query, top_k=8):
+        """
+        Run hybrid search: BM25 + semantic similarity.
+        """
+        corpus = self._build_corpus(query)
+        if not corpus:
+            return "No relevant content found."
+
+        bm25 = BM25Okapi([doc.split() for doc in corpus])
+        bm25_scores = np.array(bm25.get_scores(query.split()))
+
+        emb_corpus = self.embedder.encode(corpus, convert_to_numpy=True, normalize_embeddings=True)
+        emb_query = self.embedder.encode(query, convert_to_numpy=True, normalize_embeddings=True)
+        sem_scores = np.dot(emb_corpus, emb_query)
+
+        # Normalize scores to [0, 1]
+        bm25_norm = (bm25_scores - bm25_scores.min()) / (np.ptp(bm25_scores) + 1e-8)
+        sem_norm = (sem_scores - sem_scores.min()) / (np.ptp(sem_scores) + 1e-8)
+
+        # Weighted fusion
+        hybrid_scores = self.alpha * sem_norm + (1 - self.alpha) * bm25_norm
+        top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
+
+        top_passages = [corpus[i] for i in top_indices]
+        return "\n\n".join(top_passages)
+
+    def summarize_passages(self, topic, passages):
+        if isinstance(passages, str):
+            passages = [passages]
+        text_block = "\n".join(passages)
+        try:
+            response = self.client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {"role": "system", "content": "You are an expert summarizer."},
+                    {"role": "user", "content": f"Summarize these passages about {topic}:\n\n{text_block}"}
+                ],
+                temperature=0.3
+            )
+            return response.choices[0].message.content.strip()
+        except Exception as e:
+            return f"Summarization failed: {e}"