"""RAG chatbot: DeepSeek LLM + FAISS retrieval over an uploaded PDF, served via Gradio."""

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

# Model and Tokenizer
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# device_map="auto" lets accelerate decide weight placement; do NOT assume the
# weights live on `device` above -- use model.device when moving inputs.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

# Ensure a PAD token exists: generation with padding requires one, and many
# causal LMs ship without it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Embedding model used to vectorize PDF chunks for similarity search.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vector_store = None  # FAISS index; populated by process_pdf()


def process_pdf(pdf_path):
    """Load a PDF, split it into overlapping chunks, and build the FAISS index.

    Args:
        pdf_path: Filesystem path to the uploaded PDF (Gradio `filepath` type).

    Returns:
        A status string shown in the Gradio UI.
    """
    global vector_store
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # 100-char overlap so answers spanning a chunk boundary stay retrievable.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    vector_store = FAISS.from_documents(texts, embedding_model)
    return "PDF successfully processed and indexed."


def query_rag(message, temperature, max_new_tokens, top_k, repetition_penalty,
              top_p, system_prompt, history=None):
    """Answer `message` using the top-3 retrieved PDF chunks as context.

    Args:
        message: User question.
        temperature, max_new_tokens, top_k, repetition_penalty, top_p:
            Sampling parameters (coerced to the correct numeric types below,
            since UI widgets may deliver them as strings).
        system_prompt: System instruction prepended to the prompt.
        history: Chat history from Gradio; accepted but currently unused.

    Returns:
        The model's generated answer (prompt text excluded), or a hint to
        upload a PDF first when no index exists yet.
    """
    if vector_store is None:
        return "Please upload and process a PDF first."

    # Retrieve the most relevant chunks for the question.
    docs = vector_store.similarity_search(message, k=3)
    context = "\n".join(doc.page_content for doc in docs)

    # Construct a ChatML-style prompt with the retrieved context.
    instruction = f"<|im_start|>system\n{system_prompt}\n<|im_end|>\n"
    instruction += f"Relevant context:\n{context}\n"
    instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"

    # Tokenization
    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
    # BUGFIX: with device_map="auto" the model may not be on the module-level
    # `device`; move inputs to wherever the model actually lives.
    input_ids = enc.input_ids.to(model.device)
    attention_mask = enc.attention_mask.to(model.device)

    # Coerce UI-supplied values to the types generate() expects.
    temperature = float(temperature)
    max_new_tokens = int(max_new_tokens)
    top_k = int(top_k)
    repetition_penalty = float(repetition_penalty)
    top_p = float(top_p)

    # Generate response
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,  # explicit mask avoids pad/eos ambiguity
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
    )
    # BUGFIX: decode only the newly generated tokens. The original decoded the
    # whole sequence, so the full prompt (system + context + user turn) was
    # echoed back to the user ahead of the answer.
    generated = output_ids[0][input_ids.shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)


def launch_interface():
    """Build and launch the Gradio UI: PDF upload/indexing plus a chat panel."""
    with gr.Blocks() as demo:
        gr.Markdown("## 🤖 RAG Chatbot with DeepSeek")
        pdf_uploader = gr.File(label="Upload PDF", type="filepath")
        process_btn = gr.Button("Process PDF")
        process_output = gr.Textbox(label="Processing Status", interactive=False)
        process_btn.click(process_pdf, inputs=[pdf_uploader], outputs=[process_output])

        # Chat panel with fixed sampling defaults; `history` is forwarded but
        # query_rag does not yet use it.
        gr.ChatInterface(
            lambda message, history: query_rag(
                message, 0.7, 10000, 40, 1.1, 0.95,
                "You are a helpful assistant.", history,
            )
        )

    demo.launch(share=True)  # share=True exposes a temporary public link


if __name__ == "__main__":
    launch_interface()