pradeepsengarr committed on
Commit
03245d6
·
verified ·
1 Parent(s): 2c3bba0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -207
app.py CHANGED
@@ -1,212 +1,60 @@
1
  import gradio as gr
2
- from sentence_transformers import SentenceTransformer
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM, BitsAndBytesConfig
4
  import faiss
5
- import numpy as np
6
- from langchain.text_splitter import RecursiveCharacterTextSplitter
7
- import fitz
8
- import os
9
  import torch
10
-
11
- # --- Global Variables ---
12
- index = None
13
- doc_texts = []
14
- hf_token = os.environ.get("HF_TOKEN") # Get the Hugging Face token
15
-
16
- # Language Codes for given languages
17
- lang_map = {
18
- "English": "eng_Latn",
19
- "Hindi": "hin_Deva",
20
- "Marathi": "mar_Deva",
21
- "Punjabi": "pan_Guru"
22
- }
23
-
24
- # --- Model Loading (will be loaded once on Space startup) ---
25
-
26
- # For Embedding - Using a smaller, more CPU-friendly SentenceTransformer model
27
- # This model is generally small enough that quantization isn't critically needed for it.
28
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", token=hf_token)
29
-
30
- # For LLM - Using "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
31
- llm_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
32
- tokenizer = AutoTokenizer.from_pretrained(llm_model_id, token=hf_token)
33
-
34
- # --- Quantization Configuration ---
35
- # Choose one of the following quantization methods based on your needs and resources:
36
-
37
- # Option 1: 8-bit quantization (generally good balance of performance and memory)
38
- # Requires `bitsandbytes` library: pip install bitsandbytes accelerate
39
- quantization_config = BitsAndBytesConfig(
40
- load_in_8bit=True,
41
- bnb_8bit_compute_dtype=torch.float16 # Use float16 for compute if possible
42
- )
43
-
44
- # Option 2: 4-bit quantization (most aggressive memory reduction, potential small accuracy hit)
45
- # Requires `bitsandbytes` library: pip install bitsandbytes accelerate
46
- # quantization_config = BitsAndBytesConfig(
47
- # load_in_4bit=True,
48
- # bnb_4bit_quant_type="nf4", # NormalFloat 4-bit
49
- # bnb_4bit_compute_dtype=torch.float16, # Use float16 for compute if possible
50
- # bnb_4bit_use_double_quant=True, # Double quantization for slightly better precision
51
- # )
52
-
53
- # Load the LLM with quantization
54
- model = AutoModelForCausalLM.from_pretrained(
55
- llm_model_id,
56
- quantization_config=quantization_config, # Apply the quantization config
57
- device_map="auto", # Automatically places model parts, often on CPU for 8bit/4bit on CPU-only Spaces
58
- token=hf_token
59
- )
60
-
61
- llm = pipeline(
62
- "text-generation",
63
- model=model,
64
- tokenizer=tokenizer,
65
- max_new_tokens=300,
66
- do_sample=True,
67
- temperature=0.7,
68
- )
69
-
70
- # Load a smaller FB Translation Model
71
- # NLLB-200M is still relatively big. Quantizing it can be tricky for Seq2Seq models
72
- # with `bitsandbytes` directly for generation quality. If OOM issues persist,
73
- # consider a much smaller NLLB variant, or a different approach for translation.
74
- nllb_id = "facebook/nllb-200-distilled-600M" # This model is 600M params, can still be large
75
- nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_id)
76
- nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
77
- nllb_id,
78
- # For NLLB, direct bitsandbytes quantization might need more testing for quality.
79
- # If you encounter OOM, uncomment below lines for 8-bit if compatible and test:
80
- # quantization_config=BitsAndBytesConfig(load_in_8bit=True),
81
- device_map="auto",
82
- token=hf_token
83
  )
84
- translator = pipeline("translation", model=nllb_model, tokenizer=nllb_tokenizer)
85
-
86
- # --- Functions ---
87
-
88
- # Extract PDF text
89
- def extract_text_from_pdf(file_path):
90
- text = ""
91
- doc = fitz.open(file_path)
92
- for page in doc:
93
- text += page.get_text()
94
- return text
95
-
96
- # Upload data file handler
97
- def process_file(file):
98
- global index, doc_texts
99
-
100
- if file is None:
101
- return "Please upload a file to process.", gr.Dropdown.update(choices=["English", "Hindi", "Marathi", "Punjabi"], value="English", interactive=False), gr.Textbox.update(interactive=False), gr.Button.update(interactive=False)
102
-
103
- filename = file.name
104
- if filename.endswith(".pdf"):
105
- text = extract_text_from_pdf(file.name)
106
- elif filename.endswith(".txt"):
107
- with open(file.name, "r", encoding="utf-8") as f:
108
- text = f.read()
109
- else:
110
- return "Upload the correct files (PDF or TXT).", gr.Dropdown.update(choices=["English", "Hindi", "Marathi", "Punjabi"], value="English", interactive=False), gr.Textbox.update(interactive=False), gr.Button.update(interactive=False)
111
-
112
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
113
- doc_texts = text_splitter.split_text(text)
114
-
115
- # Ensure embeddings are float32 for FAISS if `bnb_8bit_compute_dtype` changes it
116
- embeddings = embed_model.encode(doc_texts).astype(np.float32)
117
- dim = embeddings.shape[1]
118
- index = faiss.IndexFlatL2(dim)
119
- index.add(np.array(embeddings))
120
-
121
- return "Files uploaded and processed successfully!", gr.Dropdown.update(interactive=True), gr.Textbox.update(interactive=True), gr.Button.update(interactive=True)
122
-
123
- # Retrieve context using FAISS
124
- def get_context(question, k=3):
125
- question_embedding = embed_model.encode([question]).astype(np.float32)
126
- _, I = index.search(np.array(question_embedding), k)
127
- return "\n".join([doc_texts[i] for i in I[0]])
128
-
129
- # Answers with the Translation Option
130
- def generate_answer(question, lang_choice):
131
- if index is None:
132
- return "Please upload and process a file first."
133
-
134
- context = get_context(question)
135
- # Using chat template to ensure proper formatting for TinyLlama
136
- messages = [
137
- {"role": "system", "content": "You are a helpful assistant. Answer strictly based on the context."},
138
- {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
139
- ]
140
- prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
141
-
142
-
143
- try:
144
- result = llm(prompt)
145
- # Extract the answer from the generated text, often it's after the last "Assistant" turn
146
- # This can be tricky with conversational models, you might need to adjust extraction logic
147
- # based on exact model output.
148
- generated_text = result[0]['generated_text']
149
- # A common way to get the response after the final user turn:
150
- answer = generated_text.split("assistant\n")[-1].strip()
151
- # For TinyLlama-1.1B-Chat-v1.0, it might be safer to parse the entire output or use `max_new_tokens` carefully
152
- # to ensure it doesn't repeat the prompt too much.
153
-
154
- if lang_choice != "English":
155
- src_lang = "eng_Latn"
156
- tgt_lang = lang_map.get(lang_choice, "eng_Latn")
157
- translated = translator(answer, src_lang=src_lang, tgt_lang=tgt_lang)
158
- return translated[0]['translation_text']
159
- else:
160
- return answer
161
-
162
- except Exception as e:
163
- return f"Error generating answer: {str(e)}"
164
-
165
- # --- Gradio UI ---
166
- with gr.Blocks(title="Multilingual RAG Chatbot with Quantization") as demo:
167
- gr.Markdown(
168
- """
169
- # Multilingual RAG Chatbot
170
- Upload your PDF or TXT file, then ask questions. The chatbot will retrieve relevant information
171
- and generate an answer, which can then be translated into your chosen language.
172
- """
173
- )
174
-
175
- with gr.Row():
176
- with gr.Column():
177
- file_input = gr.File(label="1. Upload Document (PDF or TXT)", file_types=[".txt", ".pdf"])
178
- upload_status = gr.Textbox(label="Processing Status", interactive=False, placeholder="No file uploaded yet.")
179
- upload_button = gr.Button("Process Document") # Explicit button to trigger processing
180
- with gr.Column():
181
- gr.Markdown("---") # Visual separator
182
- gr.Markdown("### 2. Ask a Question and Get Answer")
183
- question_box = gr.Textbox(label="Your Question", placeholder="e.g., What is the main topic of the document?")
184
- lang_dropdown = gr.Dropdown(
185
- label="Output Language",
186
- choices=["English", "Hindi", "Marathi", "Punjabi"],
187
- value="English",
188
- interactive=False # Initially disable until file is processed
189
- )
190
- generate_button = gr.Button("Generate Answer") # Explicit button for generation
191
- answer_box = gr.Textbox(label="Answer", interactive=False, lines=5, placeholder="The answer will appear here...")
192
-
193
- # Event handling
194
- upload_button.click(
195
- fn=process_file,
196
- inputs=file_input,
197
- outputs=[upload_status, lang_dropdown, question_box, generate_button] # Enable other components on success
198
- )
199
- generate_button.click(
200
- fn=generate_answer,
201
- inputs=[question_box, lang_dropdown],
202
- outputs=answer_box
203
- )
204
- # Also allow 'Enter' key for question box
205
- question_box.submit(
206
- fn=generate_answer,
207
- inputs=[question_box, lang_dropdown],
208
- outputs=answer_box
209
- )
210
-
211
 
212
- demo.launch()
 
1
  import gradio as gr
 
 
2
  import faiss
 
 
 
 
3
  import torch
4
+ from sentence_transformers import SentenceTransformer
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
6
+
7
+ # ---------- Load models ----------
8
+ embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
9
+
10
+ gen_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
11
+ gen_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float32)
12
+
13
+ # Example: EN->HI
14
+ trans_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
15
+ trans_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
16
+
17
+ # ---------- Sample docs + FAISS index ----------
18
+ documents = [
19
+ "The Taj Mahal is an ivory-white marble mausoleum in India.",
20
+ "ChatGPT is a large language model developed by OpenAI.",
21
+ "RAG combines retrieval-based and generation-based approaches."
22
+ ]
23
+
24
+ doc_embeddings = embed_model.encode(documents, convert_to_tensor=True)
25
+ index = faiss.IndexFlatL2(doc_embeddings.shape[1])
26
+ index.add(doc_embeddings.cpu().numpy())
27
+
# ---------- RAG Function ----------
def rag_translate(query, target_lang='hi'):
    """Answer *query* from the indexed documents, optionally translating.

    Embeds the query, retrieves the single nearest document from the FAISS
    index, asks Phi-2 to answer from that context, and — when the target
    language is 'hi' — translates the answer with the EN->HI Opus-MT model.

    Args:
        query: The user's question (English text).
        target_lang: Desired output language code. Only 'hi' has a loaded
            translation model; 'en' (or a falsy value) skips translation.

    Returns:
        A formatted string with the English answer and, when supported,
        its translation.
    """
    # Embed the query and fetch the closest document (k=1) from the index.
    query_vec = embed_model.encode([query])
    _, top_indices = index.search(query_vec, 1)
    retrieved_doc = documents[top_indices[0][0]]

    prompt = f"Context: {retrieved_doc}\nQuestion: {query}\nAnswer:"
    inputs = gen_tokenizer(prompt, return_tensors="pt")
    outputs = gen_model.generate(**inputs, max_new_tokens=64)

    # BUG FIX: decode only the newly generated tokens. Decoding outputs[0]
    # wholesale echoed the whole prompt (Context/Question) back into the
    # "answer", since causal-LM generate() returns prompt + completion.
    prompt_len = inputs["input_ids"].shape[1]
    answer_en = gen_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    # Translate if requested
    if target_lang and target_lang != 'en':
        if target_lang != 'hi':
            # BUG FIX: only an EN->HI model is loaded, but the UI offers
            # 'fr'/'es'; running those through the Hindi model silently
            # produced Hindi output. Report the limitation instead.
            return (f"πŸ” Answer:\n{answer_en}\n\n"
                    f"🌐 Translation to '{target_lang}' is not supported (only 'hi' is available).")
        trans_inputs = trans_tokenizer(answer_en, return_tensors="pt", truncation=True)
        trans_output = trans_model.generate(**trans_inputs)
        translated = trans_tokenizer.decode(trans_output[0], skip_special_tokens=True)
        return f"πŸ” Answer:\n{answer_en}\n\n🌐 Translated:\n{translated}"

    return f"πŸ” Answer:\n{answer_en}"
# ---------- Gradio UI ----------
# Build the widgets up front, then wire them into a single Interface.
question_input = gr.Textbox(label="Ask a Question")
language_input = gr.Dropdown(choices=["en", "hi", "fr", "es"], value="hi", label="Target Language")
answer_output = gr.Textbox(label="Answer")

# One-shot Q&A interface: (question, target language) -> formatted answer.
iface = gr.Interface(
    fn=rag_translate,
    inputs=[question_input, language_input],
    outputs=answer_output,
    title="🧠 RAG + 🌍 Translator",
    description="A lightweight RAG system with answer translation. Powered by Phi-2 + MiniLM + Opus MT."
)

iface.launch()