MusaR committed on
Commit
e8a5efc
·
1 Parent(s): 3d2ba7a

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +12 -12
  2. app.py +58 -0
  3. pipeline.py +190 -0
  4. requirements.txt +13 -0
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: NLP RAG World News
3
- emoji: 🏆
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.34.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: NLP RAG World News
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 5.34.2
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pipeline import RAGPipeline
3
+
4
# --- Load the pipeline once globally ---
# Models are loaded a single time at startup; reloading them on every
# request would make each query prohibitively slow.
print("Initializing RAG Pipeline...")
try:
    rag_pipeline = RAGPipeline(artifacts_dir="rag_artifacts")
    print("RAG Pipeline initialized successfully.")
except Exception as e:
    print(f"FATAL: Failed to initialize RAG Pipeline: {e}")
    # Keep the app alive so the UI can surface the failure instead of
    # crashing the whole Space at import time.
    rag_pipeline = None
16
# --- Define the function that Gradio will call ---
def get_answer_from_pipeline(query):
    """Answer *query* via the RAG pipeline and return a single markdown string.

    The Interface is wired to exactly one Markdown output component, so every
    code path must return one value. (The original returned a 2-tuple on the
    error paths, which a single-output Interface cannot render correctly.)

    Args:
        query: Free-text user question from the textbox.

    Returns:
        Markdown string: the generated answer followed by a sources list,
        or a human-readable error message.
    """
    if rag_pipeline is None:
        return "Error: The RAG pipeline failed to load. Please check the server logs."

    print(f"Processing query in Gradio app: {query}")
    try:
        answer, _, sources = rag_pipeline.answer_query(query)
        # Combine the answer and sources into a single string for display.
        return answer + "\n\n" + sources
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: log and report
        # rather than crash the request.
        print(f"Error during query processing: {e}")
        return f"An error occurred while processing your request: {e}"
31
# --- Build the Gradio Interface ---
title = "Ask the News: A RAG system for World News Articles"
description = """
This demo showcases a Retrieval-Augmented Generation (RAG) system built from scratch.
Enter a question about world events (e.g., Brexit, COVID-19, geopolitical conflicts),
and the system will retrieve relevant articles from a 30,000-document dataset and generate an answer.
"""
examples = [
    "What were the main arguments for and against Brexit?",
    "What was the initial response to the COVID-19 outbreak?",
    "Tell me about the conflict in South Ossetia in 2008."
]

# Using the gr.Markdown component to correctly render the links
iface = gr.Interface(
    fn=get_answer_from_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="e.g., What happened with Brexit?", label="Question"),
    outputs=gr.Markdown(label="Answer"),  # Using Markdown to render links
    title=title,
    description=description,
    examples=examples,
    # Gradio 5 renamed `allow_flagging="never"` to `flagging_mode="never"`.
    # The Space pins sdk_version 5.34.2 (README), so use the current name.
    flagging_mode="never",
    theme=gr.themes.Soft()
)

# --- Launch the App ---
if __name__ == "__main__":
    iface.launch()
pipeline.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gc
import heapq
import os
import pickle
import time
import warnings
from pathlib import Path

import faiss
import nltk
import numpy as np
import pandas as pd
import torch
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
19
# --- Basic Configuration ---
# Silence library warning chatter and avoid tokenizer fork warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

# Sentence-tokenizer data for NLTK (no-op when already downloaded).
nltk.download('punkt', quiet=True)

# Seed every RNG in play so runs are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Force CPU — presumably this targets a CPU-only Hugging Face Space.
DEVICE = "cpu"
31
class RAGPipeline:
    """End-to-end retrieval-augmented generation over a news-article corpus.

    Combines BM25 (sparse) and FAISS (dense) retrieval, cross-encoder
    reranking, and a causal LLM for answer generation.
    """

    def __init__(self, artifacts_dir="rag_artifacts"):
        """Eagerly load all on-disk artifacts and models.

        Args:
            artifacts_dir: Directory holding the parquet corpora, the
                pickled BM25 index, and the FAISS index.
        """
        self.artifacts_dir = Path(artifacts_dir)

        # Filled in by load_artifacts().
        self.df = None
        self.chunks_df = None
        self.bm25 = None
        self.index_faiss = None

        # Filled in by load_models().
        self.embedding_model = None
        self.reranker_model = None
        self.llm_model = None
        self.llm_tokenizer = None

        self.load_artifacts()
        self.load_models()
45
+ def load_artifacts(self):
46
+ print(f"--> Loading artifacts from {self.artifacts_dir}")
47
+ self.df = pd.read_parquet(self.artifacts_dir / "final_df.parquet")
48
+ self.chunks_df = pd.read_parquet(self.artifacts_dir / "chunks_df.parquet")
49
+ print(f"Loaded {len(self.df)} documents and {len(self.chunks_df)} chunks.")
50
+
51
+ with open(self.artifacts_dir / "bm25_index.pkl", "rb") as f:
52
+ self.bm25 = pickle.load(f)
53
+ print("Loaded BM25 index.")
54
+
55
+ self.index_faiss = faiss.read_index(str(self.artifacts_dir / "news_chunks.faiss_index"))
56
+ print(f"Loaded FAISS index with {self.index_faiss.ntotal} vectors.")
57
+
58
+ def load_models(self):
59
+ print("--> Loading models...")
60
+ # Dense Retriever
61
+ EMBEDDING_MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'
62
+ self.embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=DEVICE)
63
+ print(f"Embedding model '{EMBEDDING_MODEL_NAME}' loaded.")
64
+
65
+ # Reranker
66
+ CROSS_ENCODER_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
67
+ self.reranker_model = CrossEncoder(CROSS_ENCODER_MODEL_NAME, device=DEVICE, max_length=512)
68
+ print(f"Reranker model '{CROSS_ENCODER_MODEL_NAME}' loaded.")
69
+
70
+ # LLM
71
+ LLM_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
72
+ print(f"Loading LLM: {LLM_MODEL_NAME}...")
73
+ self.llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, trust_remote_code=True)
74
+ self.llm_model = AutoModelForCausalLM.from_pretrained(
75
+ LLM_MODEL_NAME,
76
+ trust_remote_code=True
77
+ )
78
+ self.llm_model.to(DEVICE)
79
+ self.llm_model.eval()
80
+
81
+ if self.llm_tokenizer.pad_token is None:
82
+ self.llm_tokenizer.pad_token = self.llm_tokenizer.eos_token
83
+ if hasattr(self.llm_model, 'config'):
84
+ self.llm_model.config.pad_token_id = self.llm_model.config.eos_token_id
85
+ print("LLM loaded successfully.")
86
+
87
+ def search_bm25(self, query: str, k: int = 5):
88
+ tokenized_query = query.lower().split()
89
+ scores = self.bm25.get_scores(tokenized_query)
90
+ topk_indices_scores = sorted(zip(range(len(scores)), scores), key=lambda x: x[1], reverse=True)[:k]
91
+ results = []
92
+ for i, score in topk_indices_scores:
93
+ chunk_info = self.chunks_df.iloc[i]
94
+ results.append({
95
+ 'chunk_id': chunk_info['chunk_id'], 'doc_id': chunk_info['doc_id'],
96
+ 'score': score, 'text': chunk_info['chunk_text'],
97
+ 'title': chunk_info['original_title'], 'url': chunk_info['original_url']
98
+ })
99
+ return results
100
+
101
+ def search_faiss(self, query: str, k: int = 5):
102
+ query_embedding = self.embedding_model.encode(query, convert_to_tensor=True, device=DEVICE)
103
+ query_embedding_cpu = query_embedding.cpu().numpy().reshape(1, -1)
104
+ faiss.normalize_L2(query_embedding_cpu)
105
+ distances, indices = self.index_faiss.search(query_embedding_cpu, k)
106
+ results = []
107
+ for i in range(len(indices[0])):
108
+ idx = indices[0][i]
109
+ score = distances[0][i]
110
+ chunk_info = self.chunks_df.iloc[idx]
111
+ results.append({
112
+ 'chunk_id': chunk_info['chunk_id'], 'doc_id': chunk_info['doc_id'],
113
+ 'score': score, 'text': chunk_info['chunk_text'],
114
+ 'title': chunk_info['original_title'], 'url': chunk_info['original_url']
115
+ })
116
+ return results
117
+
118
+ def hybrid_search_and_rerank(self, query: str, bm25_k: int = 20, faiss_k: int = 20, rerank_top_n: int = 5):
119
+ bm25_res = self.search_bm25(query, k=bm25_k)
120
+ faiss_res = self.search_faiss(query, k=faiss_k)
121
+
122
+ combined_results_dict = {}
123
+ for res_item in bm25_res + faiss_res:
124
+ chunk_id = res_item['chunk_id']
125
+ if chunk_id not in combined_results_dict:
126
+ combined_results_dict[chunk_id] = res_item
127
+
128
+ candidate_chunks = list(combined_results_dict.values())
129
+ if not candidate_chunks:
130
+ return []
131
+
132
+ reranker_pairs = [[query, chunk['text']] for chunk in candidate_chunks]
133
+ rerank_scores = self.reranker_model.predict(reranker_pairs, show_progress_bar=False)
134
+
135
+ for chunk, score in zip(candidate_chunks, rerank_scores):
136
+ chunk['rerank_score'] = score
137
+
138
+ reranked_results = sorted(candidate_chunks, key=lambda x: x['rerank_score'], reverse=True)
139
+ return reranked_results[:rerank_top_n]
140
+
141
+ def format_rag_prompt(self, query: str, context_chunks: list):
142
+ context_str = "\n\n---\n\n".join([chunk['text'] for chunk in context_chunks])
143
+ system_message = "You are a helpful AI assistant. Answer the user's QUESTION based *only* on the provided CONTEXT. If the context does not contain the answer, say 'I cannot answer the question based on the provided context.' Do not use any prior knowledge. Be concise and directly answer the question."
144
+ user_message_content = f"CONTEXT:\n{context_str}\n\nQUESTION: {query}"
145
+ messages = [
146
+ {"role": "system", "content": system_message},
147
+ {"role": "user", "content": user_message_content}
148
+ ]
149
+ prompt = self.llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
150
+ return prompt
151
+
152
+ def generate_llm_answer(self, query: str, context_chunks: list):
153
+ if not context_chunks:
154
+ return "No relevant context found to answer the question.", []
155
+
156
+ formatted_prompt = self.format_rag_prompt(query, context_chunks)
157
+ inputs = self.llm_tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True, max_length=3800).to(DEVICE)
158
+
159
+ generation_args = {
160
+ "max_new_tokens": 250, "temperature": 0.1, "do_sample": True,
161
+ "top_p": 0.9, "eos_token_id": self.llm_tokenizer.eos_token_id,
162
+ "pad_token_id": self.llm_tokenizer.pad_token_id
163
+ }
164
+
165
+ with torch.no_grad():
166
+ output_ids = self.llm_model.generate(**inputs, **generation_args)
167
+
168
+ answer = self.llm_tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
169
+ return answer.strip(), context_chunks
170
+
171
+ def answer_query(self, query: str):
172
+ print(f"Received query: {query}")
173
+ # 1. Retrieve and Rerank
174
+ retrieved_context = self.hybrid_search_and_rerank(query, bm25_k=15, faiss_k=15, rerank_top_n=3)
175
+
176
+ if not retrieved_context:
177
+ return "Could not find any relevant documents to answer your question.", [], "No context found."
178
+
179
+ # 2. Generate Answer
180
+ llm_answer, used_context_chunks = self.generate_llm_answer(query, retrieved_context)
181
+
182
+ # 3. Format sources
183
+ sources_text = "\n\n**Sources:**\n"
184
+ seen_urls = set()
185
+ for i, chunk in enumerate(used_context_chunks):
186
+ if chunk['url'] not in seen_urls:
187
+ sources_text += f"- [{chunk['title']}]({chunk['url']})\n"
188
+ seen_urls.add(chunk['url'])
189
+
190
+ return llm_answer, used_context_chunks, sources_text
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ pyarrow
3
+ datasets==2.19.*
4
+ sentence-transformers==2.7.0
5
+ faiss-cpu==1.8.0
6
+ rank_bm25==0.2.2
7
+ nltk==3.8.1
8
+ tqdm==4.66.1
9
+ transformers==4.40.*
10
+ accelerate==0.29.*
11
+ langchain
12
+ gradio
13
+ torch