File size: 4,120 Bytes
9edbb70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2554008
e73bf0f
9edbb70
a64e065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9edbb70
a64e065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9edbb70
a64e065
 
 
9edbb70
 
 
 
 
 
 
 
 
 
 
a64e065
9edbb70
 
 
a64e065
 
 
 
 
 
3394cd2
9edbb70
 
 
 
a64e065
9edbb70
1795471
9edbb70
1795471
a64e065
 
65c53a0
 
a64e065
 
bbf1ecd
 
9edbb70
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import pdfplumber
from dotenv import load_dotenv
import gradio as gr
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from transformers import AutoTokenizer
from langchain.document_loaders import PyPDFLoader
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain



# Price is a factor for our company, so we're going to use a low-cost model.
MODEL = "gpt-4o-mini"  # OpenAI chat model used to answer questions
db_name = "vector_db"  # on-disk directory where the Chroma vector store persists

# Load environment variables (e.g. OPENAI_API_KEY) from a file called .env.

# override=True lets values from .env replace variables already set in the environment.
load_dotenv(override=True)


def process_pdf(pdf_file):
    """Build a ConversationalRetrievalChain from an uploaded PDF.

    Loads the PDF, splits it into overlapping chunks, embeds the chunks
    into a fresh Chroma vector store, then wires up a chat LLM with
    conversation memory over a retriever on that store.

    Args:
        pdf_file: Uploaded-file object exposing a ``.name`` path
            (e.g. the value produced by a Gradio ``File`` component).

    Returns:
        A ``ConversationalRetrievalChain`` ready to answer questions
        about the document.

    Raises:
        RuntimeError: If loading, splitting, or embedding fails; the
            original exception is chained as the cause.
    """
    try:
        loader = PyPDFLoader(pdf_file.name)
        pages = loader.load()
        if not pages:
            raise ValueError("No text found in PDF.")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50
        )
        chunks = text_splitter.split_documents(pages)
        if not chunks:
            # An encrypted/protected or image-only PDF can yield pages
            # with no splittable text.
            raise ValueError(
                f"Unable to split the PDF into chunks; the file may be "
                f"encrypted/protected: {pdf_file.name}."
            )

        embeddings = OpenAIEmbeddings()

        # Start from a clean collection so stale chunks from a previous
        # upload don't leak into this session's answers.
        if os.path.exists(db_name):
            Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

        # Embed the chunks with OpenAI Embeddings.
        vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)

        # Sanity check: report the embedding dimensionality of one stored vector.
        collection = vectorstore._collection
        sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
        dimensions = len(sample_embedding)
        print(f"The vectors have {dimensions:,} dimensions")

        # Create the OpenAI chat model.
        llm = ChatOpenAI(temperature=0.7, model=MODEL)

        # Conversation memory so follow-up questions keep context.
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

        # Retriever backed by the vector store.
        retriever = vectorstore.as_retriever()

        # Assemble the Conversational Retrieval Chain.
        conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
        return conversation_chain
    except Exception as e:
        raise RuntimeError(f"PDF processing failed: {e}") from e

# Function to upload PDF
def upload_pdf(file):
    """Gradio callback: build the QA chain for a newly uploaded PDF.

    Stores the resulting chain in the module-level ``chain`` global
    (read by ``ask_question``) and returns a status message for the UI.

    Args:
        file: The uploaded file from the ``gr.File`` component, or
            ``None`` when the upload is cleared.

    Returns:
        A human-readable status string.
    """
    global chain
    if file is None:
        chain = None
        return "Please upload a PDF file!"
    chain = process_pdf(file)
    return "PDF processed. You can now ask questions."

# ask_question function
def ask_question(message, history):
    if chain is None:
        return "upload the pdf first"
    else:
        try:
            result = chain.invoke({"question":message})
            answer = result.get("answer", "No answer found.")
        except Exception as e:
            answer = f"Error:{str(e)}"
    history.append((message, answer))
    return history, history, ""
# Building Gradio Interface
# Building Gradio Interface: PDF upload widget, status line, and chat UI.
with gr.Blocks() as demo:
    gr.Markdown("## Chat with your pdf!!")
    # File uploader (restricted to .pdf); its change event runs upload_pdf.
    file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
    # Status text (read-only); updated with upload_pdf's return value.
    status = gr.Textbox(label="Status", interactive=False)

    chatbot = gr.Chatbot(label="Chat history!!!")
    msg=gr.Textbox(label="Ask anything related to pdf...")
    clear = gr.Button("Clear chat")

    # Conversation history stored as a list of (question, answer) tuples.
    state = gr.State([]) 

    file_input.change(upload_pdf, inputs=[file_input], outputs=[status])
    # Submitting the textbox answers the question, updates both the visible
    # chat and the stored history, and clears the input box.
    msg.submit(ask_question, [msg, state], [chatbot, state, msg])
    # Reset both the visible chat and the stored history to empty lists.
    clear.click(lambda: ([],[]), None, [chatbot, state])
    chain = None  # module-level QA chain; set by upload_pdf, read by ask_question

# Launch the app
demo.launch(inline=False)