mohamedachraf committed on
Commit
fc83ecf
·
1 Parent(s): 7a5d2ae

modify the pipeline

Browse files
Files changed (1) hide show
  1. app.py +28 -19
app.py CHANGED
@@ -19,7 +19,7 @@ from langchain.prompts.prompt import PromptTemplate
19
  from langchain.vectorstores.base import VectorStoreRetriever
20
 
21
  import torch
22
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
23
  from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
24
 
25
  from transformers import TextIteratorStreamer
@@ -28,8 +28,10 @@ import os
28
  import tempfile
29
 
30
 
31
- # Prompt template
32
- template = """Context: {context}
 
 
33
 
34
  Question: {question}
35
 
@@ -37,15 +39,16 @@ Answer:"""
37
  QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
38
 
39
 
40
- # Load Phi-2 model from hugging face hub
41
- model_id = "microsoft/phi-2"
 
 
 
 
42
 
43
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
44
- if tokenizer.pad_token is None:
45
- tokenizer.pad_token = tokenizer.eos_token
46
-
47
- model = AutoModelForCausalLM.from_pretrained(
48
- model_id, torch_dtype=torch.float32, trust_remote_code=True
49
  )
50
 
51
  # sentence transformers to be used in vector store
@@ -121,6 +124,12 @@ def get_retrieval_qa_chain(text_file, hf_model):
121
  if text_file != default_text_file or default_text_file is None:
122
  if text_file is not None and os.path.exists(text_file):
123
  retriever, vectorstore = prepare_vector_store_retriever(text_file)
 
 
 
 
 
 
124
 
125
  chain = RetrievalQA.from_chain_type(
126
  llm=hf_model,
@@ -137,15 +146,14 @@ def generate(question, answer, text_file, max_new_tokens):
137
  return
138
 
139
  try:
140
- # Create pipeline without streamer first to test
141
  phi2_pipeline = pipeline(
142
- "text-generation",
143
  model=model,
144
  tokenizer=tokenizer,
145
  max_new_tokens=max_new_tokens,
146
- do_sample=False, # ← greedy
147
- pad_token_id=tokenizer.eos_token_id,
148
- eos_token_id=tokenizer.eos_token_id,
149
  )
150
 
151
  hf_model = HuggingFacePipeline(pipeline=phi2_pipeline)
@@ -192,13 +200,14 @@ def upload_file(file):
192
  with gr.Blocks() as demo:
193
  gr.Markdown(
194
  """
195
- # Retrieval Augmented Generation with Phi-2: Question Answering demo
196
- ### This demo uses the Phi-2 language model and Retrieval Augmented Generation (RAG). It allows you to upload a txt or PDF file and ask the model questions related to the content of that file.
197
  ### Features:
198
  - Support for both PDF and text files
199
  - Retrieval-based question answering using document context
 
200
  ### To get started, upload a text (.txt) or PDF (.pdf) file using the upload button below.
201
- The context size of the Phi-2 model is 2048 tokens, so large documents are automatically split into chunks.
202
  Retrieval Augmented Generation (RAG) enables us to retrieve just the few small chunks of the document that are relevant to your query and inject it into our prompt.
203
  The model is then able to answer questions by incorporating knowledge from the newly provided document.
204
  """
 
19
  from langchain.vectorstores.base import VectorStoreRetriever
20
 
21
  import torch
22
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
23
  from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
24
 
25
  from transformers import TextIteratorStreamer
 
28
  import tempfile
29
 
30
 
31
+ # Prompt template optimized for Flan-T5
32
+ template = """Answer the question based on the context below.
33
+
34
+ Context: {context}
35
 
36
  Question: {question}
37
 
 
39
  QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
40
 
41
 
42
+ # Load Flan-T5 model from hugging face hub - excellent for CPU and Q&A tasks
43
+ # Alternative popular CPU-friendly models you can try:
44
+ # - "google/flan-t5-small" (faster, smaller)
45
+ # - "google/flan-t5-large" (better quality, slower)
46
+ # - "microsoft/DialoGPT-medium" (conversational)
47
+ model_id = "google/flan-t5-base"
48
 
49
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
50
+ model = AutoModelForSeq2SeqLM.from_pretrained(
51
+ model_id, torch_dtype=torch.float32
 
 
 
52
  )
53
 
54
  # sentence transformers to be used in vector store
 
124
  if text_file != default_text_file or default_text_file is None:
125
  if text_file is not None and os.path.exists(text_file):
126
  retriever, vectorstore = prepare_vector_store_retriever(text_file)
127
+ else:
128
+ # Create a dummy retriever if no file is available
129
+ dummy_doc = Document(page_content="No document loaded. Please upload a file to get started.")
130
+ dummy_vectorstore = FAISS.from_documents([dummy_doc], embeddings)
131
+ retriever = VectorStoreRetriever(vectorstore=dummy_vectorstore, search_kwargs={"k": 1})
132
+ vectorstore = dummy_vectorstore
133
 
134
  chain = RetrievalQA.from_chain_type(
135
  llm=hf_model,
 
146
  return
147
 
148
  try:
149
+ # Create pipeline for text2text generation (Flan-T5)
150
  phi2_pipeline = pipeline(
151
+ "text2text-generation",
152
  model=model,
153
  tokenizer=tokenizer,
154
  max_new_tokens=max_new_tokens,
155
+ do_sample=False,
156
+ return_full_text=False,
 
157
  )
158
 
159
  hf_model = HuggingFacePipeline(pipeline=phi2_pipeline)
 
200
  with gr.Blocks() as demo:
201
  gr.Markdown(
202
  """
203
+ # Retrieval Augmented Generation with Flan-T5: Question Answering demo
204
+ ### This demo uses Google's Flan-T5 language model and Retrieval Augmented Generation (RAG). It allows you to upload a txt or PDF file and ask the model questions related to the content of that file.
205
  ### Features:
206
  - Support for both PDF and text files
207
  - Retrieval-based question answering using document context
208
+ - Optimized for CPU performance using Flan-T5-Base model
209
  ### To get started, upload a text (.txt) or PDF (.pdf) file using the upload button below.
210
+ The Flan-T5 model is efficient and works well on CPU, making it perfect for document Q&A tasks.
211
  Retrieval Augmented Generation (RAG) enables us to retrieve just the few small chunks of the document that are relevant to your query and inject it into our prompt.
212
  The model is then able to answer questions by incorporating knowledge from the newly provided document.
213
  """