Create app.py
app.py ADDED
@@ -0,0 +1,212 @@
import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz  # PyMuPDF
import os
import torch

# --- Global Variables ---
index = None
doc_texts = []
hf_token = os.environ.get("HF_TOKEN")  # Get the Hugging Face token

# Language codes for the supported output languages
lang_map = {
    "English": "eng_Latn",
    "Hindi": "hin_Deva",
    "Marathi": "mar_Deva",
    "Punjabi": "pan_Guru"
}

# --- Model Loading (runs once on Space startup) ---

# For embedding - a small, CPU-friendly SentenceTransformer model.
# It is small enough that quantization isn't needed for it.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", token=hf_token)
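
# Note (added for reference): all-MiniLM-L6-v2 produces 384-dimensional float
# vectors, so the FAISS index built in process_file() below will have dim == 384.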

# For the LLM - "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
llm_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(llm_model_id, token=hf_token)

# --- Quantization Configuration ---
# Choose one of the following quantization methods based on your needs and resources:

# Option 1: 8-bit quantization (generally a good balance of quality and memory)
# Requires the `bitsandbytes` library: pip install bitsandbytes accelerate
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16  # Use float16 for compute if possible
)

# Option 2: 4-bit quantization (most aggressive memory reduction, potential small accuracy hit)
# Requires the `bitsandbytes` library: pip install bitsandbytes accelerate
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",             # NormalFloat 4-bit
#     bnb_4bit_compute_dtype=torch.float16,  # Use float16 for compute if possible
#     bnb_4bit_use_double_quant=True,        # Double quantization for slightly better precision
# )
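
# A minimal fallback sketch (not in the original): bitsandbytes quantization
# only runs on CUDA GPUs, so on a CPU-only Space you could skip it entirely:
# quantization_config = quantization_config if torch.cuda.is_available() else None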

# Load the LLM with quantization
model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    quantization_config=quantization_config,  # Apply the quantization config
    device_map="auto",  # Automatically places model shards; note that bitsandbytes 8-/4-bit loading requires a CUDA GPU
    token=hf_token
)

llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.7,
)
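
# Design note: do_sample=True with temperature=0.7 trades determinism for
# variety; for strictly context-grounded RAG answers, do_sample=False
# (greedy decoding) is a common alternative.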

# Load a smaller Facebook translation model.
# NLLB-200-distilled-600M is still relatively big, and quantizing Seq2Seq models
# directly with `bitsandbytes` can hurt generation quality. If OOM issues persist,
# consider a much smaller NLLB variant, or a different approach for translation.
nllb_id = "facebook/nllb-200-distilled-600M"  # 600M parameters, can still be large
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_id)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    nllb_id,
    # Direct bitsandbytes quantization of NLLB may need more testing for quality.
    # If you encounter OOM, uncomment the line below for 8-bit and test:
    # quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    token=hf_token
)
translator = pipeline("translation", model=nllb_model, tokenizer=nllb_tokenizer)
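
# Illustrative check (not in the original) of the NLLB pipeline call used below:
# translator("Hello, world!", src_lang="eng_Latn", tgt_lang="hin_Deva")
# returns a list like [{'translation_text': '...'}]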

# --- Functions ---

# Extract text from every page of a PDF
def extract_text_from_pdf(file_path):
    text = ""
    doc = fitz.open(file_path)
    for page in doc:
        text += page.get_text()
    doc.close()  # Release the file handle
    return text

# Upload handler: extract, chunk, embed, and index the document
def process_file(file):
    global index, doc_texts

    if file is None:
        # gr.update works across Gradio versions; the class-level
        # gr.Dropdown.update / gr.Textbox.update helpers were removed in Gradio 4.
        return (
            "Please upload a file to process.",
            gr.update(choices=["English", "Hindi", "Marathi", "Punjabi"], value="English", interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
        )

    filename = file.name
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(file.name)
    elif filename.endswith(".txt"):
        with open(file.name, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        return (
            "Unsupported file type; please upload a PDF or TXT file.",
            gr.update(choices=["English", "Hindi", "Marathi", "Punjabi"], value="English", interactive=False),
            gr.update(interactive=False),
            gr.update(interactive=False),
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    doc_texts = text_splitter.split_text(text)

    # FAISS requires float32 vectors
    embeddings = embed_model.encode(doc_texts).astype(np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    return (
        "File uploaded and processed successfully!",
        gr.update(interactive=True),
        gr.update(interactive=True),
        gr.update(interactive=True),
    )
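
# Rough arithmetic for the splitter above: chunk_size=300 with chunk_overlap=50
# gives an effective stride of ~250 characters, so a 1,000-character document
# yields roughly four chunks (the splitter also respects separator boundaries,
# so actual counts vary).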

# Retrieve the k most similar chunks using FAISS
def get_context(question, k=3):
    question_embedding = embed_model.encode([question]).astype(np.float32)
    _, I = index.search(question_embedding, k)
    return "\n".join([doc_texts[i] for i in I[0]])
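
# FAISS note: index.search returns a (distances, indices) pair, each of shape
# (num_queries, k); I[0] holds the ids of the k chunks nearest to the question.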

# Generate an answer, with optional translation of the result
def generate_answer(question, lang_choice):
    if index is None:
        return "Please upload and process a file first."

    context = get_context(question)
    # Use the chat template so the prompt is formatted correctly for TinyLlama
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Answer strictly based on the context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    try:
        result = llm(prompt)
        # The pipeline returns the prompt plus the completion. TinyLlama's
        # (Zephyr-style) chat template marks the reply with "<|assistant|>",
        # so keep only the text after the final marker.
        generated_text = result[0]['generated_text']
        answer = generated_text.split("<|assistant|>")[-1].strip()

        if lang_choice != "English":
            src_lang = "eng_Latn"
            tgt_lang = lang_map.get(lang_choice, "eng_Latn")
            translated = translator(answer, src_lang=src_lang, tgt_lang=tgt_lang)
            return translated[0]['translation_text']
        else:
            return answer

    except Exception as e:
        return f"Error generating answer: {str(e)}"
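
# For reference (an assumption based on the Zephyr-style template this model
# ships with), apply_chat_template renders a prompt of roughly this shape:
# <|system|>
# You are a helpful assistant. ...</s>
# <|user|>
# Context: ...</s>
# <|assistant|>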

# --- Gradio UI ---
with gr.Blocks(title="Multilingual RAG Chatbot with Quantization") as demo:
    gr.Markdown(
        """
        # Multilingual RAG Chatbot
        Upload your PDF or TXT file, then ask questions. The chatbot will retrieve relevant information
        and generate an answer, which can then be translated into your chosen language.
        """
    )

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="1. Upload Document (PDF or TXT)", file_types=[".txt", ".pdf"])
            upload_status = gr.Textbox(label="Processing Status", interactive=False, placeholder="No file uploaded yet.")
            upload_button = gr.Button("Process Document")  # Explicit button to trigger processing
        with gr.Column():
            gr.Markdown("---")  # Visual separator
            gr.Markdown("### 2. Ask a Question and Get an Answer")
            question_box = gr.Textbox(label="Your Question", placeholder="e.g., What is the main topic of the document?")
            lang_dropdown = gr.Dropdown(
                label="Output Language",
                choices=["English", "Hindi", "Marathi", "Punjabi"],
                value="English",
                interactive=False  # Disabled until a file is processed
            )
            generate_button = gr.Button("Generate Answer")  # Explicit button for generation
            answer_box = gr.Textbox(label="Answer", interactive=False, lines=5, placeholder="The answer will appear here...")

    # Event handling
    upload_button.click(
        fn=process_file,
        inputs=file_input,
        outputs=[upload_status, lang_dropdown, question_box, generate_button]  # Enable other components on success
    )
    generate_button.click(
        fn=generate_answer,
        inputs=[question_box, lang_dropdown],
        outputs=answer_box
    )
    # Also allow the 'Enter' key to submit from the question box
    question_box.submit(
        fn=generate_answer,
        inputs=[question_box, lang_dropdown],
        outputs=answer_box
    )

demo.launch()
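
# Suggested requirements.txt for this Space (an assumption inferred from the
# imports above and the bitsandbytes notes, not part of the original file):
# gradio
# sentence-transformers
# transformers
# faiss-cpu
# numpy
# langchain
# PyMuPDF
# torch
# bitsandbytes
# accelerate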