# PDF Document Search and Q&A — Gradio app backed by Pinecone and FLAN-T5.
# (Removed Hugging Face Spaces file-viewer residue — status lines, commit hashes,
# and the line-number gutter — that had been pasted above the code and would
# prevent the module from parsing.)
import gradio as gr
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import PyPDF2
import io
import os
from tqdm import tqdm
# --- Model initialization (module-level side effects: downloads weights on first use) ---

# Sentence embedding model; produces 384-dim vectors (must match the Pinecone
# index dimension declared below).
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')

# FLAN-T5 tokenizer + seq2seq model used to generate answers from retrieved context.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Initialize Pinecone with an environment variable.
# BUG FIX: the original passed what appears to be a leaked API key as the *name*
# of the environment variable, so os.getenv() always returned None (and the
# secret was committed to source). Read the conventional PINECONE_API_KEY
# variable instead; never hard-code secrets.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
# --- Pinecone index setup ---
# One vector per PDF text chunk; the index is created only on first run.
_INDEX_NAME = 'pdf-index'

if _INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=_INDEX_NAME,
        # 384 is the output dimension of 'all-MiniLM-L6-v2'.
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Handle shared by the upload and search paths below.
index = pc.Index(_INDEX_NAME)
# Function to extract text from the PDF file using PyPDF2
def process_pdf(file):
    """Extract text from one uploaded PDF, chunk it, embed each chunk, and upsert to Pinecone.

    Args:
        file: a Gradio file object; its ``.name`` attribute is the path of the
            temporary file written to disk by the upload widget.

    Returns:
        A human-readable status string with the number of chunks indexed.
    """
    # BUG FIX: the original returned the raw extracted text immediately, making
    # the chunking / embedding / upsert code below it unreachable — nothing was
    # ever indexed. The dead code also re-opened the extracted *str* with
    # io.BytesIO, which would have raised TypeError had it ever run. We now
    # chunk the extracted text directly.
    pdf_path = file.name

    # Read and concatenate the text of every page.
    with open(pdf_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # extract_text() can return None for image-only pages; coerce to "".
        pdf_content = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split into smaller chunks (roughly 1000 characters each).
    text_chunks = [pdf_content[i:i + 1000] for i in range(0, len(pdf_content), 1000)]

    # Create embeddings and upload to Pinecone.
    processed_chunks = 0
    for i, chunk in enumerate(text_chunks):
        try:
            embedding = embeddings_model.encode(chunk)
            index.upsert(
                vectors=[(
                    f"{file.name}_chunk_{i}",  # deterministic id: re-uploading overwrites
                    embedding.tolist(),
                    {
                        'file_name': file.name,
                        'chunk_num': i,
                        'text': chunk,  # stored so search results can show source text
                    }
                )]
            )
            processed_chunks += 1
        except Exception as e:
            # Best effort: one bad chunk should not abort the whole document.
            print(f"Error processing chunk {i}: {str(e)}")

    return f"Successfully processed {processed_chunks} chunks from {file.name}"
def process_multiple_pdfs(files):
    """Index each uploaded PDF in turn and combine the per-file status messages.

    Args:
        files: iterable of Gradio file objects from the multi-file upload widget.

    Returns:
        The per-file status strings joined by newlines, in upload order.
    """
    return "\n".join(process_pdf(uploaded) for uploaded in files)
def search_documents(query):
    """Answer a question from the indexed PDF chunks.

    Embeds the query, retrieves the 3 most similar chunks from Pinecone, and
    asks FLAN-T5 to answer using only that retrieved context.

    Args:
        query: the user's natural-language question.

    Returns:
        A ``(answer, sources)`` tuple: the generated answer string and a
        newline-joined list of "Source: <file>" lines for the retrieved chunks.
    """
    # Create embedding for the query.
    query_embedding = embeddings_model.encode(query)

    # Search Pinecone for the nearest chunks.
    results = index.query(
        vector=query_embedding.tolist(),
        top_k=3,
        include_metadata=True
    )
    matches = results['matches']

    # ROBUSTNESS FIX: with an empty index the original built an empty context
    # and still prompted the model, yielding a confident answer with no sources.
    if not matches:
        return "No documents have been indexed yet. Please upload PDFs first.", ""

    # Generate an answer with FLAN-T5 grounded in the retrieved chunks.
    context = "\n".join(match['metadata']['text'] for match in matches)
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    # NOTE: temperature/top_p were removed — transformers ignores them unless
    # do_sample=True, so beam search behaves identically without the misleading
    # (and warning-producing) arguments.
    outputs = model.generate(
        **inputs,
        max_length=512,
        num_beams=4,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Format sources for display alongside the answer.
    sources = [f"Source: {match['metadata']['file_name']}" for match in matches]
    return answer, "\n".join(sources)
# --- Gradio interface ---
# Two tabs: one to upload and index PDFs, one to ask questions against them.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Document Search and Q&A")

    # Tab 1: upload and index documents.
    with gr.Tab("Upload Documents"):
        file_output = gr.File(file_count="multiple", label="Upload PDF Files")
        upload_button = gr.Button("Process PDFs")
        upload_output = gr.Textbox(label="Processing Results")
        upload_button.click(
            process_multiple_pdfs,
            inputs=[file_output],
            outputs=[upload_output],
        )

    # Tab 2: semantic search + question answering.
    with gr.Tab("Search and Ask"):
        query_input = gr.Textbox(label="Enter your question")
        search_button = gr.Button("Search")
        answer_output = gr.Textbox(label="Answer")
        sources_output = gr.Textbox(label="Sources")
        search_button.click(
            search_documents,
            inputs=[query_input],
            outputs=[answer_output, sources_output],
        )

demo.launch()