import fitz  # PyMuPDF
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from dotenv import load_dotenv
import os
import gradio as gr
# Load environment variables from a local .env file (if present).
load_dotenv()
# NOTE(review): api_key is None when OPENAI_API_KEY is unset; the OpenAI client
# will then fail at request time rather than here — consider failing fast.
api_key = os.getenv("OPENAI_API_KEY")
# Initialize the OpenAI client shared by generate_response() below.
client = OpenAI(api_key=api_key)
# File Path (replace "sample.pdf" with the name of your PDF file)
PDF_FILE = "Resume_Pratiksha.pdf" #"Company_HR_Policy.pdf" # Ensure this file is in the same directory as app.py
# Utility Functions
def load_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    On any failure (missing file, corrupt PDF, ...) this returns an
    "Error reading PDF: ..." string instead of raising, so callers can
    detect failure by the "Error" prefix.
    """
    try:
        with fitz.open(file_path) as doc:
            pages = [page.get_text() for page in doc]
        return "".join(pages)
    except Exception as exc:
        return f"Error reading PDF: {exc}"
def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Break *text* into overlapping chunks wrapped as LangChain documents."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.create_documents([text])
def create_and_load_db(chunks, persist_directory="pdf_embeddings"):
    """Embed *chunks* into a persistent Chroma store and return a reader handle."""
    embedder = HuggingFaceEmbeddings()
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    # Flush the embeddings to disk before reopening the directory read-side.
    store.persist()
    return Chroma(persist_directory=persist_directory, embedding_function=embedder)
def generate_response(context, question):
    """Ask the chat model to answer *question* using *context*; return the text.

    API failures are reported as an "Error generating response: ..." string
    rather than raised, mirroring load_pdf's error convention.
    """
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
    conversation = [
        {"role": "system", "content": "You are an assistant that answers questions based on PDF content."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Replace with preferred model
            messages=conversation,
            max_tokens=150,
        )
        return completion.choices[0].message.content.strip()
    except Exception as exc:
        return f"Error generating response: {exc}"
def process_question(question):
    """Answer *question* from PDF_FILE's content; return an answer or error string.

    Fixes over the original:
    - The vector store is built once and cached on the function object, instead
      of re-reading, re-chunking, and re-embedding the PDF on every question.
      (Safe because PDF_FILE is a fixed module-level constant.)
    - Whitespace-only questions are rejected like empty ones.
    """
    if not question or not question.strip():
        return "Please provide a question."
    # Reuse the cached vector store when available; build it on first use.
    vectordb = getattr(process_question, "_vectordb", None)
    if vectordb is None:
        # Step 1: Load and extract text from the PDF
        pdf_text = load_pdf(PDF_FILE)
        if pdf_text.startswith("Error"):
            return pdf_text
        # Step 2: Split the text into chunks
        chunks = split_text(pdf_text)
        # Step 3: Create and load ChromaDB
        vectordb = create_and_load_db(chunks)
        process_question._vectordb = vectordb
    # Step 4: Perform similarity search
    try:
        docs = vectordb.similarity_search(question)
        if not docs:
            return "No relevant information found."
        # Step 5: Generate a response using the most relevant retrieved chunk
        context = docs[0].page_content
        return generate_response(context, question)
    except Exception as e:
        return f"Error during similarity search or response generation: {str(e)}"
# Gradio UI: one question box in, one read-only answer box out.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")
    with gr.Row():
        # Side-by-side: question input on the left, answer display on the right.
        question_input = gr.Textbox(label="Ask a Question", placeholder="Enter your question here...")
        output = gr.Textbox(label="Answer", lines=5, interactive=False)
    submit_button = gr.Button("Submit")
    # Wire the button to the RAG pipeline defined above.
    submit_button.click(process_question, inputs=[question_input], outputs=output)
# Launch the local Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()