# Spaces: Sleeping
import os

import fitz  # PyMuPDF
import gradio as gr
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
# Read settings from a local .env file (when one exists) into the environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Shared OpenAI client used by generate_response().
client = OpenAI(api_key=api_key)

# PDF the chatbot answers questions about; it must sit in the same directory
# as app.py. An alternative document is "Company_HR_Policy.pdf".
PDF_FILE = "Resume_Pratiksha.pdf"
# Utility functions
def load_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Failures are reported in-band rather than raised: if opening or reading
    the document raises any exception, the function returns a string of the
    form ``"Error reading PDF: <details>"``.
    """
    try:
        with fitz.open(file_path) as pdf:
            page_texts = []
            for page in pdf:
                page_texts.append(page.get_text())
            return "".join(page_texts)
    except Exception as exc:
        return f"Error reading PDF: {exc}"
def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Break ``text`` into chunk documents suitable for embedding.

    Args:
        text: The full text to split.
        chunk_size: Target maximum size of a chunk, measured with ``len``.
        chunk_overlap: Amount of text shared between neighbouring chunks,
            measured the same way.

    Returns:
        The documents produced by
        ``RecursiveCharacterTextSplitter.create_documents`` for ``text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    documents = splitter.create_documents([text])
    return documents
def create_and_load_db(chunks, persist_directory="pdf_embeddings"):
    """Embed ``chunks`` into a Chroma store on disk and return a reopened handle.

    Args:
        chunks: Documents to embed and store.
        persist_directory: Directory in which Chroma keeps its data.

    Returns:
        A ``Chroma`` instance opened on ``persist_directory`` with the same
        embedding function that was used to build the store.
    """
    embedder = HuggingFaceEmbeddings()

    # NOTE(review): every call adds ``chunks`` to whatever already lives in
    # ``persist_directory`` -- presumably duplicates accumulate across calls;
    # confirm against the Chroma version in use.
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    store.persist()

    reopened = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
    )
    return reopened
def generate_response(context, question):
    """Ask the OpenAI chat model to answer ``question`` using ``context``.

    Args:
        context: Text retrieved from the PDF that should ground the answer.
        question: The user's question.

    Returns:
        The model's reply with surrounding whitespace removed. If anything
        fails, a string of the form ``"Error generating response: <details>"``
        is returned instead of raising.
    """
    try:
        system_prompt = "You are an assistant that answers questions based on PDF content."
        user_prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Replace with preferred model
            messages=conversation,
            max_tokens=150,
        )
        answer = completion.choices[0].message.content
        return answer.strip()
    except Exception as err:
        return f"Error generating response: {err}"
# Vector stores already built in this process, keyed by PDF path, so the PDF
# is parsed and embedded once instead of on every question.
_VECTORDB_CACHE = {}


def _build_vectordb(pdf_path):
    """Index the PDF at ``pdf_path``; return ``(vectordb, error_message)``.

    Exactly one element of the returned pair is ``None``.
    """
    pdf_text = load_pdf(pdf_path)
    # load_pdf() reports failures in-band with this exact prefix. Matching the
    # whole prefix avoids misreading a document whose text merely begins with
    # the word "Error".
    if pdf_text.startswith("Error reading PDF:"):
        return None, pdf_text
    # For example a scanned PDF with no text layer: there is nothing to embed.
    if not pdf_text.strip():
        return None, "No extractable text found in the PDF."
    try:
        chunks = split_text(pdf_text)
        return create_and_load_db(chunks), None
    except Exception as e:
        # Embedding-model or Chroma failures become a message for the UI
        # instead of an unhandled exception in the Gradio callback.
        return None, f"Error building the vector index: {e}"


def process_question(question):
    """Answer ``question`` from the contents of ``PDF_FILE``.

    The PDF is loaded, chunked and embedded on first use and the resulting
    vector store is reused for later questions in the same process. The single
    best-matching chunk is passed to ``generate_response`` as context.

    Args:
        question: The user's question text.

    Returns:
        The generated answer, or a human-readable message when the question is
        blank, the PDF cannot be read or indexed, nothing relevant is found,
        or the search/generation step fails. This function does not raise for
        those failures.
    """
    # Treat whitespace-only input the same as an empty question.
    if not question or (isinstance(question, str) and not question.strip()):
        return "Please provide a question."

    vectordb = _VECTORDB_CACHE.get(PDF_FILE)
    if vectordb is None:
        vectordb, error = _build_vectordb(PDF_FILE)
        if error is not None:
            # Failures are not cached, so the next question retries the build.
            return error
        _VECTORDB_CACHE[PDF_FILE] = vectordb

    try:
        docs = vectordb.similarity_search(question)
        if not docs:
            return "No relevant information found."
        # Only the top-ranked chunk is used as context for the model.
        context = docs[0].page_content
        return generate_response(context, question)
    except Exception as e:
        return f"Error during similarity search or response generation: {str(e)}"
# Gradio UI
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed -- presumably only the question box sits inside the Row; confirm.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")

    with gr.Row():
        question_input = gr.Textbox(
            label="Ask a Question",
            placeholder="Enter your question here...",
        )

    output = gr.Textbox(label="Answer", lines=5, interactive=False)
    submit_button = gr.Button("Submit")

    # Clicking the button sends the question text to process_question() and
    # shows the returned string in the answer box.
    submit_button.click(fn=process_question, inputs=[question_input], outputs=output)


if __name__ == "__main__":
    demo.launch()