Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,22 +1,32 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 5 |
import torch
|
| 6 |
import PyPDF2
|
| 7 |
import io
|
| 8 |
import os
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
| 11 |
-
# Initialize models
|
| 12 |
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 13 |
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
|
| 14 |
-
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
|
| 15 |
|
| 16 |
# Initialize Pinecone with environment variable
|
| 17 |
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def process_pdf(file):
|
| 22 |
# Read PDF content
|
|
@@ -76,7 +86,7 @@ def search_documents(query):
|
|
| 76 |
)
|
| 77 |
|
| 78 |
# Generate answer using FLAN-T5
|
| 79 |
-
context = "\n".join([match
|
| 80 |
|
| 81 |
prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
|
| 82 |
inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
|
|
@@ -92,7 +102,7 @@ def search_documents(query):
|
|
| 92 |
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 93 |
|
| 94 |
# Format sources
|
| 95 |
-
sources = [f"Source: {match
|
| 96 |
|
| 97 |
return answer, "\n".join(sources)
|
| 98 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from pinecone import Pinecone
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 5 |
import torch
|
| 6 |
import PyPDF2
|
| 7 |
import io
|
| 8 |
import os
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
| 11 |
+
# Initialize models
# Sentence-embedding model. Its output size (384, per the index definition
# below) must match the dimension of the Pinecone index.
embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
# FLAN-T5 tokenizer/model pair used to generate the answer text.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Initialize Pinecone with environment variable
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if it doesn't exist
if 'pdf-index' not in pc.list_indexes().names():
    # `spec` is a required argument of Pinecone.create_index; omitting it
    # raises TypeError the first time the app runs against a project that
    # has no 'pdf-index' yet. The cloud/region defaults below are an
    # assumption (override via PINECONE_CLOUD / PINECONE_REGION) -- TODO
    # confirm they match the target Pinecone project.
    pc.create_index(
        name='pdf-index',
        dimension=384,  # dimension for 'all-MiniLM-L6-v2'
        metric='cosine',
        spec={
            'serverless': {
                'cloud': os.getenv('PINECONE_CLOUD', 'aws'),
                'region': os.getenv('PINECONE_REGION', 'us-east-1'),
            }
        },
    )

# Connect to index
index = pc.Index('pdf-index')
|
| 30 |
|
| 31 |
def process_pdf(file):
|
| 32 |
# Read PDF content
|
|
|
|
| 86 |
)
|
| 87 |
|
| 88 |
# Generate answer using FLAN-T5
|
| 89 |
+
context = "\n".join([match['metadata']['text'] for match in results['matches']])
|
| 90 |
|
| 91 |
prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
|
| 92 |
inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
|
|
|
|
| 102 |
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 103 |
|
| 104 |
# Format sources
|
| 105 |
+
sources = [f"Source: {match['metadata']['file_name']}" for match in results['matches']]
|
| 106 |
|
| 107 |
return answer, "\n".join(sources)
|
| 108 |
|