Spaces:
Sleeping
Sleeping
File size: 4,737 Bytes
074614d 5b4c66c ff0995c 074614d ff0995c 074614d 5b4c66c 6288d51 074614d 5b4c66c 6288d51 ff0995c 6288d51 5b4c66c ff0995c 074614d 6288d51 ff0995c 074614d ff0995c 074614d ff0995c 6288d51 ff0995c 6288d51 ff0995c 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 ff0995c 074614d ff0995c 6288d51 ff0995c 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 6288d51 074614d 5b4c66c 6288d51 074614d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFaceHub
# Module-level conversation state shared by the functions in this file.
# `conversation` is None until get_conversation_chain() stores a chain in it;
# ask_question() refuses to answer while it is still None.
# `chat_history` is overwritten by ask_question() with the history returned by
# the chain after each successful answer.
# NOTE(review): as plain module globals these are presumably shared by every
# browser session served by this process — confirm that is acceptable.
conversation = None
chat_history = []
def get_pdf_text(pdf_docs):
    """Extract the text of every page of every PDF in *pdf_docs*.

    Extraction is best-effort: a PDF that cannot be opened or parsed is
    reported on stdout and skipped, while text collected so far (including
    pages read from that file before the failure) is kept.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader``. ``None`` or
            an empty iterable is treated as "no documents".

    Returns:
        The extracted page texts concatenated in order, each followed by a
        newline, or ``None`` when nothing but whitespace was extracted.
    """
    pieces = []
    for pdf in pdf_docs or ():
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:  # skip pages with no extractable text
                    pieces.append(page_text)
                    pieces.append("\n")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the batch.
            print(f"Error reading PDF: {str(e)}")
    # Join once instead of growing a string with += inside the loops.
    text = "".join(pieces)
    return text if text.strip() else None
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split *text* into overlapping chunks on newline boundaries.

    Args:
        text: The text to split. ``None`` or ``""`` yields no chunks.
        chunk_size: Target chunk length handed to ``CharacterTextSplitter``,
            measured with ``len``. Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``CharacterTextSplitter``. Defaults to 200.

    Returns:
        A list of chunk strings; an empty list when *text* is empty.
    """
    if not text:
        return []
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in a FAISS vector store.

    Returns ``None`` when there are no chunks (``None`` or empty); otherwise
    the store built by ``FAISS.from_texts`` using a default-constructed
    ``HuggingFaceEmbeddings`` instance.
    """
    if text_chunks:
        embedder = HuggingFaceEmbeddings()
        return FAISS.from_texts(texts=text_chunks, embedding=embedder)
    return None
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    The chain combines a ``HuggingFaceHub`` LLM (``google/flan-t5-xxl``), a
    ``ConversationBufferMemory`` keyed on ``chat_history`` and the retriever
    obtained from ``vectorstore.as_retriever()``.

    Side effect: the new chain is also stored in the module-level
    ``conversation`` variable.

    Returns:
        The newly created ``ConversationalRetrievalChain``.
    """
    global conversation
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    buffer_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    doc_retriever = vectorstore.as_retriever()
    conversation = ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=doc_retriever,
        memory=buffer_memory,
    )
    return conversation
def process_files(files):
    """Index the uploaded PDFs and prepare the question-answering chain.

    Runs the pipeline: extract text -> split into chunks -> build the vector
    store -> create the conversation chain. It stops at the first stage that
    produces nothing and reports which one.

    Args:
        files: The uploaded PDF files; ``None`` or an empty list means
            nothing was uploaded.

    Returns:
        A human-readable status message. Exceptions raised by any stage are
        caught and reported in the message rather than propagated.
    """
    # NOTE(review): the status emoji were restored from mis-encoded text
    # ("β"); the success literal was additionally broken across two lines.
    # Confirm the intended glyphs.
    if not files:
        return "Please upload files first"
    try:
        # Get PDF text
        raw_text = get_pdf_text(files)
        if not raw_text:
            return "❌ Could not extract text from PDF(s). The file may be scanned or corrupted."
        # Get text chunks
        text_chunks = get_text_chunks(raw_text)
        if not text_chunks:
            return "❌ No valid text chunks could be created."
        # Create vector store
        vectorstore = get_vectorstore(text_chunks)
        if not vectorstore:
            return "❌ Failed to create vector store."
        # Create conversation chain; get_conversation_chain() itself stores
        # the chain in the module-level `conversation`.
        get_conversation_chain(vectorstore)
        return "✅ Files processed successfully! You can now ask questions."
    except Exception as e:
        return f"❌ Error processing files: {str(e)}"
def ask_question(question, history):
    """Answer *question* with the active chain and extend the chat history.

    Args:
        question: The user's question. Empty, ``None`` or whitespace-only
            input is ignored.
        history: List of ``(question, answer)`` pairs shown so far; ``None``
            is treated as an empty list.

    Returns:
        The history list. It is unchanged when the question is blank;
        otherwise a new list with one ``(question, reply)`` pair appended,
        where the reply is the chain's answer, a prompt to process files
        first, or an ``"Error: ..."`` message if the chain raised.

    Side effect: on success the module-level ``chat_history`` is replaced by
    the history returned by the chain.
    """
    global conversation, chat_history
    # The chat component may hand over None before any message exists.
    history = history or []
    if not question or not question.strip():
        return history
    if not conversation:
        return history + [(question, "Please process files first")]
    try:
        response = conversation({"question": question})
        answer = response["answer"]
        chat_history = response["chat_history"]
    except Exception as e:
        # Surface the failure in the chat instead of crashing the callback.
        return history + [(question, f"Error: {str(e)}")]
    return history + [(question, answer)]
# Gradio interface: upload/processing controls in the left column, chat in the
# wider right column.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # NOTE(review): heading emoji restored from a mis-encoded "π"; confirm
    # the intended glyph.
    gr.Markdown("# 📚 Chat with PDFs")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload PDFs",
                file_types=[".pdf"],
                file_count="multiple",
            )
            process_btn = gr.Button("Process")
            status = gr.Textbox(label="Status")
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Ask about your documents...",
            )
            submit_btn = gr.Button("Submit")

    # Event handlers
    process_btn.click(
        process_files,
        inputs=file_input,
        outputs=status,
    )
    # The Submit button and pressing Enter in the textbox trigger the same
    # handler; wiring both in one loop keeps them in sync.
    for register in (submit_btn.click, question.submit):
        register(
            ask_question,
            inputs=[question, chatbot],
            outputs=[chatbot],
        )
if __name__ == '__main__':
    # Load variables from a local .env file into the environment before the
    # UI starts (presumably the Hugging Face API token — confirm).
    load_dotenv()
    demo.launch()