import gradio as gr
import torch
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
import PyPDF2
from docx import Document

# Exact sentinel prefixes emitted by the extractors below.  process_resume()
# matches against these instead of a bare "Error" prefix, so a resume whose
# text legitimately begins with the word "Error" is not mistaken for a failure.
_PDF_ERROR_PREFIX = "Error reading PDF:"
_DOCX_ERROR_PREFIX = "Error reading DOCX:"


class ResumeRAG:
    """RAG pipeline over a single uploaded resume.

    Extracts text from a PDF/DOCX file, chunks it, indexes the chunks in a
    FAISS vector store (MiniLM embeddings), and answers questions by
    retrieving the top-k chunks and prompting Mistral-7B-Instruct with them.
    Requires a CUDA GPU (the model is loaded in 4-bit via bitsandbytes).
    """

    def __init__(self) -> None:
        """Load embeddings, the text splitter, tokenizer, and the 4-bit LLM.

        Raises:
            RuntimeError: if no CUDA GPU is available.
        """
        self.has_cuda = torch.cuda.is_available()
        self.device = "cuda" if self.has_cuda else "cpu"
        print(f"Using device: {self.device}")

        # Embeddings (small + fast)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": self.device},
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=50
        )
        # Populated by process_resume(); None until a resume is indexed.
        self.vector_store = None

        model_name = "mistralai/Mistral-7B-Instruct-v0.2"
        if not self.has_cuda:
            raise RuntimeError(
                "No CUDA GPU detected. Use a GPU Space/ZeroGPU, or switch to a smaller CPU model."
            )

        # 4-bit quantization for GPU efficiency
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("Loading model...")
        # NOTE(security): trust_remote_code=True executes model-repo code at
        # load time. Mistral-7B-Instruct-v0.2 does not normally require it —
        # confirm and drop if possible.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",  # important for Spaces
            trust_remote_code=True,
        )

        # Ensure pad token exists (Mistral's tokenizer ships without one).
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return all page text of a PDF, or a sentinel error string on failure.

        Pages are joined with newlines so words at page boundaries do not fuse
        together (joining with "" would concatenate them directly).
        """
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join((p.extract_text() or "") for p in reader.pages)
        except Exception as e:
            return f"{_PDF_ERROR_PREFIX} {e}"

    def extract_text_from_docx(self, file_path: str) -> str:
        """Return all paragraph text of a DOCX, or a sentinel error string on failure."""
        try:
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        except Exception as e:
            return f"{_DOCX_ERROR_PREFIX} {e}"

    def process_resume(self, file) -> str:
        """Extract, chunk, and index an uploaded resume file.

        Args:
            file: a Gradio file object (has a ``.name`` path attribute), or None.

        Returns:
            A human-readable status message for the UI.
        """
        if file is None:
            return "Please upload a resume file."

        file_path = file.name
        if file_path.lower().endswith(".pdf"):
            text = self.extract_text_from_pdf(file_path)
        elif file_path.lower().endswith(".docx"):
            text = self.extract_text_from_docx(file_path)
        else:
            return "Unsupported file format. Please upload PDF or DOCX."

        # Match the extractors' exact sentinels, not a bare "Error" prefix,
        # so resume text that starts with "Error" is not misclassified.
        if text.startswith((_PDF_ERROR_PREFIX, _DOCX_ERROR_PREFIX)):
            return text
        if not text.strip():
            return "No text could be extracted from the resume."

        chunks = self.text_splitter.split_text(text)
        if not chunks:
            return "No text chunks could be created from the resume."

        self.vector_store = FAISS.from_texts(chunks, self.embeddings)
        return f"✅ Resume processed successfully! Extracted {len(chunks)} text chunks."

    def generate_answer(self, question: str, context: str) -> str:
        """Generate an answer with the LLM, grounded in the retrieved context."""
        prompt = f"""[INST] You are a helpful assistant analyzing a resume.

Context: {context}

Question: {question}

Answer only from the context. If the answer is not in the context, say it is not in the resume.
[/INST]"""

        inputs = self.tokenizer(prompt, return_tensors="pt")
        # Move inputs onto the SAME device as the model's embedding weights;
        # with device_map="auto" the first layer may live on any device.
        target_device = self.model.get_input_embeddings().weight.device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # decode() returns prompt + completion; keep only the completion.
        if "[/INST]" in text:
            return text.split("[/INST]")[-1].strip()
        return text.strip()

    def query(self, question: str):
        """Answer a question about the indexed resume.

        Returns:
            (answer, context) strings for the UI; context is empty on errors.
        """
        if self.vector_store is None:
            return "Please upload a resume first.", ""
        if not question.strip():
            return "Please enter a question.", ""

        docs = self.vector_store.similarity_search(question, k=3)
        context = "\n\n".join(d.page_content for d in docs)
        answer = self.generate_answer(question, context)

        # Release cached activations between requests to limit VRAM growth.
        if self.has_cuda:
            torch.cuda.empty_cache()
        return answer, context


print("Initializing Resume RAG System...")
rag_system = ResumeRAG()

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(
        """
        # 📄 Resume RAG Q&A System
        Powered by Mistral-7B + FAISS vector search

        Upload your resume and ask questions about experience, skills, education, and more.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Resume")
            file_input = gr.File(
                label="Upload PDF or DOCX", file_types=[".pdf", ".docx"]
            )
            upload_btn = gr.Button("Process Resume", variant="primary", size="lg")
            upload_status = gr.Textbox(label="Status", interactive=False)
            gr.Markdown(
                """
                ---
                **Example Questions:**
                - What programming languages does the candidate know?
                - Summarize the work experience
                - What is the education background?
                - List all technical skills
                """
            )

        with gr.Column(scale=2):
            gr.Markdown("### 💬 Ask Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., What are the candidate's key skills?",
                lines=2,
            )
            submit_btn = gr.Button("Get Answer", variant="primary", size="lg")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
            with gr.Accordion("📚 Retrieved Context", open=False):
                context_output = gr.Textbox(
                    label="Relevant Resume Sections", lines=6, interactive=False
                )

    # GPU-decorated handler for ZeroGPU/Spaces GPU: the decorator attaches a
    # GPU only for the duration of each call.
    @spaces.GPU
    def query_gpu(q):
        return rag_system.query(q)

    upload_btn.click(
        fn=rag_system.process_resume, inputs=[file_input], outputs=[upload_status]
    )
    submit_btn.click(
        fn=query_gpu,
        inputs=[question_input],
        outputs=[answer_output, context_output],
    )
    question_input.submit(
        fn=query_gpu,
        inputs=[question_input],
        outputs=[answer_output, context_output],
    )

if __name__ == "__main__":
    demo.launch(share=True)