manoj1hcl committed on
Commit
aca5cc5
·
verified ·
1 Parent(s): bbf1ecd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -74
app.py CHANGED
@@ -1,117 +1,122 @@
1
  import os
2
- import pdfplumber
3
  from dotenv import load_dotenv
4
  import gradio as gr
 
5
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
6
  from langchain_chroma import Chroma
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain.chains import RetrievalQA
9
- from langchain.llms import OpenAI
10
- from langchain.document_loaders import TextLoader
11
- from langchain.docstore.document import Document
12
- from transformers import AutoTokenizer
13
- from langchain.document_loaders import PyPDFLoader
14
  from langchain.memory import ConversationBufferMemory
15
  from langchain.chains import ConversationalRetrievalChain
16
 
17
-
18
-
19
- # price is a factor for our company, so we're going to use a low cost model
20
  MODEL = "gpt-4o-mini"
21
- db_name = "vector_db"
22
-
23
- # Load environment variables in a file called .env
24
 
25
  load_dotenv(override=True)
26
 
 
 
27
 
28
  def process_pdf(pdf_file):
29
  try:
30
- loader = PyPDFLoader(pdf_file.name)
 
 
 
 
 
 
31
  pages = loader.load()
 
32
  if not pages:
33
- raise ValueError("No text found in padf.")
34
- text_splitter = RecursiveCharacterTextSplitter(
 
35
  chunk_size=500,
36
- chunk_overlap=50
37
  )
38
- chunks = text_splitter.split_documents(pages)
39
- if not chunks:
40
- raise ValueError("Unable to split the PDF into chunks.")
41
  if not chunks:
42
- raise ValueError(f"ERROR: File is ecrypted/protected No text chunks generated fro {pdf_file}.")
 
 
43
  embeddings = OpenAIEmbeddings()
44
- #print(chunks)
45
- if os.path.exists(db_name):
46
- Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()
47
-
48
- # Embed the chunks with OpenAI Embeddings
49
-
50
- vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
51
-
52
- # Sample embedding dimension
53
- collection = vectorstore._collection
54
- sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
55
- dimensions = len(sample_embedding)
56
- print(f"The vectors have {dimensions:,} dimensions")
57
-
58
- # Create the OpenAI Chat Model
59
- llm = ChatOpenAI(temperature=0.7, model=MODEL) # Or another model
60
-
61
- # Set up conversation memory
62
- memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
63
-
64
- # Set up the retriever (vector store)
65
- retriever = vectorstore.as_retriever()
66
-
67
- # Set up the Conversational Retrieval Chain
68
- conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
69
-
70
- # Return the conversation chain
71
-
72
  return conversation_chain
 
73
  except Exception as e:
74
  raise RuntimeError(f"PDF processing failed: {str(e)}")
75
 
76
- # Function to upload PDF
77
  def upload_pdf(file):
78
  global chain
79
  if file is None:
80
  chain = None
81
- return "pleae upload the file!"
82
  chain = process_pdf(file)
83
- return "processed the file ask questions"
 
84
 
85
- # ask_question function
86
  def ask_question(message, history):
87
  if chain is None:
88
- return "upload the pdf first"
89
- else:
90
- try:
91
- result = chain.invoke({"question":message})
92
- answer = result.get("answer", "No answer found.")
93
- except Exception as e:
94
- answer = f"Error:{str(e)}"
95
- history.append((message, answer))
 
 
 
 
 
96
  return history, history, ""
97
- # Building Gradio Interface
 
98
  with gr.Blocks() as demo:
99
- gr.Markdown("## Chat with your pdf!!")
100
- # File uploader
101
  file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
102
- # Status text
103
  status = gr.Textbox(label="Status", interactive=False)
104
 
105
- chatbot = gr.Chatbot(label="Chat history!!!")
106
- msg=gr.Textbox(label="Ask anything related to pdf...")
107
  clear = gr.Button("Clear chat")
108
 
109
- state = gr.State([])
110
 
111
  file_input.change(upload_pdf, inputs=[file_input], outputs=[status])
112
- msg.submit(ask_question, [msg, state], [chatbot, state, msg])
113
- clear.click(lambda: ([],[]), None, [chatbot, state])
114
- chain = None # global QA chain
115
 
116
- # Launch the app
117
- demo.launch(inline=False)
 
1
  import os
2
+ import shutil
3
  from dotenv import load_dotenv
4
  import gradio as gr
5
+
6
  from langchain_openai import OpenAIEmbeddings, ChatOpenAI
7
  from langchain_chroma import Chroma
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from langchain_community.document_loaders import PyPDFLoader
 
 
 
 
 
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain.chains import ConversationalRetrievalChain
12
 
13
+ # Low-cost model
 
 
14
  MODEL = "gpt-4o-mini"
15
+ DB_DIR = "vector_db"
 
 
16
 
17
  load_dotenv(override=True)
18
 
19
+ chain = None # global chain shared by every session (OK for single-user use only)
20
+
21
 
22
  def process_pdf(pdf_file):
23
  try:
24
+ if not os.getenv("OPENAI_API_KEY"):
25
+ raise RuntimeError(
26
+ "OPENAI_API_KEY is not set. Add it to your environment or as a Secret on HF Spaces."
27
+ )
28
+
29
+ file_path = pdf_file.name # gr.File gives a temp file with .name path
30
+ loader = PyPDFLoader(file_path)
31
  pages = loader.load()
32
+
33
  if not pages:
34
+ raise ValueError("No text found in PDF (may be scanned or protected).")
35
+
36
+ splitter = RecursiveCharacterTextSplitter(
37
  chunk_size=500,
38
+ chunk_overlap=50,
39
  )
40
+ chunks = splitter.split_documents(pages)
41
+
 
42
  if not chunks:
43
+ raise ValueError("Unable to split PDF into chunks (empty/protected PDF).")
44
+
45
+ # Embeddings (you can also specify: model="text-embedding-3-small")
46
  embeddings = OpenAIEmbeddings()
47
+
48
+ # Reset persisted DB each upload
49
+ if os.path.exists(DB_DIR):
50
+ shutil.rmtree(DB_DIR, ignore_errors=True)
51
+
52
+ vectorstore = Chroma.from_documents(
53
+ documents=chunks,
54
+ embedding=embeddings,
55
+ persist_directory=DB_DIR,
56
+ )
57
+
58
+ llm = ChatOpenAI(model=MODEL, temperature=0.2)
59
+
60
+ memory = ConversationBufferMemory(
61
+ memory_key="chat_history",
62
+ return_messages=True,
63
+ )
64
+
65
+ retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
66
+
67
+ conversation_chain = ConversationalRetrievalChain.from_llm(
68
+ llm=llm,
69
+ retriever=retriever,
70
+ memory=memory,
71
+ )
72
+
 
 
73
  return conversation_chain
74
+
75
  except Exception as e:
76
  raise RuntimeError(f"PDF processing failed: {str(e)}")
77
 
78
+
79
  def upload_pdf(file):
80
  global chain
81
  if file is None:
82
  chain = None
83
+ return "Please upload a PDF."
84
  chain = process_pdf(file)
85
+ return "PDF processed. Ask questions now."
86
+
87
 
 
88
  def ask_question(message, history):
89
  if chain is None:
90
+ history = history or []
91
+ history.append({"role": "assistant", "content": "Upload the PDF first."})
92
+ return history, history, ""
93
+
94
+ try:
95
+ result = chain.invoke({"question": message})
96
+ answer = result.get("answer", "No answer found.")
97
+ except Exception as e:
98
+ answer = f"Error: {str(e)}"
99
+
100
+ history = history or []
101
+ history.append({"role": "user", "content": message})
102
+ history.append({"role": "assistant", "content": answer})
103
  return history, history, ""
104
+
105
+
106
  with gr.Blocks() as demo:
107
+ gr.Markdown("## Chat with your PDF")
108
+
109
  file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
 
110
  status = gr.Textbox(label="Status", interactive=False)
111
 
112
+ chatbot = gr.Chatbot(label="Chat history", type="messages")
113
+ msg = gr.Textbox(label="Ask anything related to the PDF...")
114
  clear = gr.Button("Clear chat")
115
 
116
+ state = gr.State([])
117
 
118
  file_input.change(upload_pdf, inputs=[file_input], outputs=[status])
119
+ msg.submit(ask_question, inputs=[msg, state], outputs=[chatbot, state, msg])
120
+ clear.click(lambda: ([], []), inputs=None, outputs=[chatbot, state])
 
121
 
122
+ demo.launch(inline=False)