Himanshu kumar Vishwakrma committed on
Commit
6288d51
·
1 Parent(s): b96bff0
Files changed (1) hide show
  1. app.py +57 -41
app.py CHANGED
@@ -10,36 +10,29 @@ from langchain.chains import ConversationalRetrievalChain
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain_community.llms import HuggingFaceHub
12
 
13
- # Initialize global variables
14
  conversation = None
15
  chat_history = []
16
- process_complete = False
17
 
18
- def get_pdf_text(pdf_file):
19
- """Extract text from PDF"""
20
- reader = PdfReader(pdf_file)
21
  text = ""
22
- for page in reader.pages:
23
- text += page.extract_text() or ""
24
- return text
25
-
26
- def get_docx_text(docx_file):
27
- """Extract text from DOCX"""
28
- doc = docx.Document(docx_file)
29
- return "\n".join([para.text for para in doc.paragraphs])
30
-
31
- def get_files_text(files):
32
- """Process multiple files"""
33
- text = ""
34
- for file in files:
35
- if file.name.endswith(".pdf"):
36
- text += get_pdf_text(file)
37
- elif file.name.endswith(".docx"):
38
- text += get_docx_text(file)
39
- return text
40
 
41
  def get_text_chunks(text):
42
  """Split text into chunks"""
 
 
 
43
  text_splitter = CharacterTextSplitter(
44
  separator="\n",
45
  chunk_size=1000,
@@ -49,51 +42,74 @@ def get_text_chunks(text):
49
  return text_splitter.split_text(text)
50
 
51
  def get_vectorstore(text_chunks):
52
- """Create vector store from text"""
 
 
 
53
  embeddings = HuggingFaceEmbeddings()
54
- return FAISS.from_texts(text_chunks, embeddings)
55
 
56
  def get_conversation_chain(vectorstore):
57
- """Initialize conversation chain"""
 
 
58
  llm = HuggingFaceHub(
59
- repo_id="google/flan-t5-large",
60
- model_kwargs={"temperature": 0.5, "max_length": 512}
61
  )
 
62
  memory = ConversationBufferMemory(
63
  memory_key='chat_history',
64
  return_messages=True
65
  )
66
- return ConversationalRetrievalChain.from_llm(
 
67
  llm=llm,
68
  retriever=vectorstore.as_retriever(),
69
  memory=memory
70
  )
 
71
 
72
  def process_files(files):
73
  """Handle file processing"""
74
- global conversation, process_complete
 
75
  if not files:
76
  return "Please upload files first"
77
 
78
  try:
79
- raw_text = get_files_text(files)
 
 
 
 
 
80
  text_chunks = get_text_chunks(raw_text)
 
 
 
 
81
  vectorstore = get_vectorstore(text_chunks)
82
- conversation = get_conversation_chain(vectorstore)
83
- process_complete = True
 
 
 
84
  return "✅ Files processed successfully! You can now ask questions."
 
85
  except Exception as e:
86
- return f"❌ Error: {str(e)}"
87
 
88
  def ask_question(question, history):
89
  """Handle question answering"""
90
  global conversation, chat_history
91
- if not process_complete:
92
- return history + [(question, "Please process files first")]
93
 
94
  if not question:
95
  return history
96
 
 
 
 
97
  try:
98
  response = conversation({"question": question})
99
  answer = response["answer"]
@@ -104,16 +120,16 @@ def ask_question(question, history):
104
 
105
  # Gradio Interface
106
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
107
- gr.Markdown("# 📄 PDF/DOCX Chatbot")
108
 
109
  with gr.Row():
110
  with gr.Column(scale=1):
111
  file_input = gr.File(
112
- label="Upload Files",
113
- file_types=[".pdf", ".docx"],
114
  file_count="multiple"
115
  )
116
- process_btn = gr.Button("Process Files")
117
  status = gr.Textbox(label="Status")
118
 
119
  with gr.Column(scale=2):
@@ -143,6 +159,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
143
  outputs=[chatbot]
144
  )
145
 
146
- if __name__ == "__main__":
147
  load_dotenv()
148
  demo.launch()
 
10
  from langchain.memory import ConversationBufferMemory
11
  from langchain_community.llms import HuggingFaceHub
12
 
13
+ # Initialize conversation state
14
  conversation = None
15
  chat_history = []
 
16
 
17
+ def get_pdf_text(pdf_docs):
18
+ """Improved PDF text extraction with error handling"""
 
19
  text = ""
20
+ for pdf in pdf_docs:
21
+ try:
22
+ pdf_reader = PdfReader(pdf)
23
+ for page in pdf_reader.pages:
24
+ page_text = page.extract_text()
25
+ if page_text: # Only add if text was extracted
26
+ text += page_text + "\n"
27
+ except Exception as e:
28
+ print(f"Error reading PDF: {str(e)}")
29
+ return text if text.strip() else None
 
 
 
 
 
 
 
 
30
 
31
  def get_text_chunks(text):
32
  """Split text into chunks"""
33
+ if not text:
34
+ return []
35
+
36
  text_splitter = CharacterTextSplitter(
37
  separator="\n",
38
  chunk_size=1000,
 
42
  return text_splitter.split_text(text)
43
 
44
  def get_vectorstore(text_chunks):
45
+ """Create vector store using HuggingFace embeddings"""
46
+ if not text_chunks:
47
+ return None
48
+
49
  embeddings = HuggingFaceEmbeddings()
50
+ return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
51
 
52
  def get_conversation_chain(vectorstore):
53
+ """Create conversation chain with HuggingFace model"""
54
+ global conversation
55
+
56
  llm = HuggingFaceHub(
57
+ repo_id="google/flan-t5-xxl",
58
+ model_kwargs={"temperature":0.5, "max_length":512}
59
  )
60
+
61
  memory = ConversationBufferMemory(
62
  memory_key='chat_history',
63
  return_messages=True
64
  )
65
+
66
+ conversation = ConversationalRetrievalChain.from_llm(
67
  llm=llm,
68
  retriever=vectorstore.as_retriever(),
69
  memory=memory
70
  )
71
+ return conversation
72
 
73
  def process_files(files):
74
  """Handle file processing"""
75
+ global conversation, chat_history
76
+
77
  if not files:
78
  return "Please upload files first"
79
 
80
  try:
81
+ # Get PDF text
82
+ raw_text = get_pdf_text(files)
83
+ if not raw_text:
84
+ return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
85
+
86
+ # Get text chunks
87
  text_chunks = get_text_chunks(raw_text)
88
+ if not text_chunks:
89
+ return "❌ No valid text chunks could be created."
90
+
91
+ # Create vector store
92
  vectorstore = get_vectorstore(text_chunks)
93
+ if not vectorstore:
94
+ return "❌ Failed to create vector store."
95
+
96
+ # Create conversation chain
97
+ get_conversation_chain(vectorstore)
98
  return "✅ Files processed successfully! You can now ask questions."
99
+
100
  except Exception as e:
101
+ return f"❌ Error processing files: {str(e)}"
102
 
103
  def ask_question(question, history):
104
  """Handle question answering"""
105
  global conversation, chat_history
 
 
106
 
107
  if not question:
108
  return history
109
 
110
+ if not conversation:
111
+ return history + [(question, "Please process files first")]
112
+
113
  try:
114
  response = conversation({"question": question})
115
  answer = response["answer"]
 
120
 
121
  # Gradio Interface
122
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
123
+ gr.Markdown("# 📄 Chat with PDFs")
124
 
125
  with gr.Row():
126
  with gr.Column(scale=1):
127
  file_input = gr.File(
128
+ label="Upload PDFs",
129
+ file_types=[".pdf"],
130
  file_count="multiple"
131
  )
132
+ process_btn = gr.Button("Process")
133
  status = gr.Textbox(label="Status")
134
 
135
  with gr.Column(scale=2):
 
159
  outputs=[chatbot]
160
  )
161
 
162
+ if __name__ == '__main__':
163
  load_dotenv()
164
  demo.launch()