File size: 3,539 Bytes
e07782d
 
 
 
 
4af86e4
e07782d
405cac8
e07782d
 
4af86e4
e07782d
4af86e4
 
e07782d
 
f554d22
bde4f3f
f554d22
e07782d
f554d22
e07782d
 
f554d22
 
e07782d
bbc9430
e07782d
 
 
bbc9430
 
 
 
e07782d
 
 
bbc9430
 
 
 
e07782d
 
 
 
 
 
 
 
 
405cac8
e07782d
 
 
1c69765
e07782d
bbc9430
 
405cac8
f554d22
bbc9430
e07782d
1c69765
f554d22
bbc9430
129b7f2
bbc9430
1c69765
129b7f2
bbc9430
1c69765
129b7f2
bbc9430
1c69765
bbc9430
f554d22
bbc9430
 
4af86e4
bbc9430
 
f554d22
bbc9430
 
 
1c69765
405cac8
 
 
 
 
 
 
 
 
 
 
f4db732
405cac8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import fitz  # PyMuPDF
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from dotenv import load_dotenv
import os
import gradio as gr

# Load environment variables
# load_dotenv() reads a local .env file so OPENAI_API_KEY can live outside the source tree.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # may be None if unset — the API call will then fail at request time

# Initialize OpenAI client
# NOTE(review): created at import time; presumably one shared client is intended — confirm.
client = OpenAI(api_key=api_key)

# File Path (replace "sample.pdf" with the name of your PDF file)
PDF_FILE = "Resume_Pratiksha.pdf" #"Company_HR_Policy.pdf"  # Ensure this file is in the same directory as app.py

# Utility Functions
def load_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    On any failure (missing file, corrupt PDF, missing dependency) this
    does not raise; it returns a string beginning with "Error reading PDF:"
    so callers can detect problems with str.startswith("Error").
    """
    try:
        pages = []
        with fitz.open(file_path) as doc:
            for page in doc:
                pages.append(page.get_text())
        return "".join(pages)
    except Exception as e:
        return f"Error reading PDF: {e}"

def split_text(text, chunk_size=1000, chunk_overlap=20):
    """Break *text* into overlapping chunks suitable for embedding.

    Returns the list of langchain Document objects produced by
    RecursiveCharacterTextSplitter over the single input string.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.create_documents([text])

def create_and_load_db(chunks, persist_directory="pdf_embeddings"):
    """Embed *chunks* into a persistent Chroma store and return a fresh handle.

    A HuggingFace embedding model vectorizes the chunks; the collection is
    written to *persist_directory*, persisted, and then reopened so the
    returned store reads from the on-disk data.
    """
    embedding_model = HuggingFaceEmbeddings()
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
    store.persist()
    return Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

def generate_response(context, question):
    """Ask the OpenAI chat model *question*, grounded in *context*.

    Returns the stripped answer text. If the API call fails for any
    reason, returns a string beginning with "Error generating response:"
    instead of raising.
    """
    try:
        system_msg = {"role": "system", "content": "You are an assistant that answers questions based on PDF content."}
        user_msg = {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"}
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Replace with preferred model
            messages=[system_msg, user_msg],
            max_tokens=150,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        return f"Error generating response: {e}"

def process_question(question):
    """Answer *question* from the contents of PDF_FILE via retrieval + LLM.

    The PDF is loaded, chunked, and embedded into a Chroma store on the
    FIRST call only; the store is cached on the function object and reused
    afterwards, since the PDF never changes at runtime. (Previously every
    question re-read and re-embedded the whole PDF — the dominant cost per
    request.) Returns the model's answer, or a human-readable error string
    if any stage fails.
    """
    if not question:
        return "Please provide a question."

    # Build the vector store once and cache it for subsequent questions.
    vectordb = getattr(process_question, "_vectordb_cache", None)
    if vectordb is None:
        # Step 1: Load and extract text from the PDF
        pdf_text = load_pdf(PDF_FILE)
        if pdf_text.startswith("Error"):  # load_pdf signals failure via this prefix
            return pdf_text

        # Step 2: Split the text into chunks
        chunks = split_text(pdf_text)

        # Step 3: Create and load ChromaDB (only cached on success)
        vectordb = create_and_load_db(chunks)
        process_question._vectordb_cache = vectordb

    # Step 4: Perform similarity search
    try:
        docs = vectordb.similarity_search(question)
        if not docs:
            return "No relevant information found."

        # Step 5: Generate a response using the retrieved context
        context = docs[0].page_content
        response = generate_response(context, question)
        return response
    except Exception as e:
        return f"Error during similarity search or response generation: {str(e)}"

# Gradio UI: a single-page chat form — one question box, one read-only answer box.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")

    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question", placeholder="Enter your question here...")
        output = gr.Textbox(label="Answer", lines=5, interactive=False)

    submit_button = gr.Button("Submit")
    # Wire the button to the RAG pipeline: the textbox value goes in,
    # process_question's returned string is displayed in the output box.
    submit_button.click(process_question, inputs=[question_input], outputs=output)

if __name__ == "__main__":
    # Start the local Gradio server only when run as a script.
    demo.launch()