abakerdp committed on
Commit
ff894bd
·
verified ·
1 Parent(s): 91770df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -8
app.py CHANGED
@@ -1,22 +1,32 @@
1
  import gradio as gr
2
- import pinecone
3
  from sentence_transformers import SentenceTransformer
4
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Fixed this line
5
  import torch
6
  import PyPDF2
7
  import io
8
  import os
9
  from tqdm import tqdm
10
 
11
- # Initialize models and Pinecone
12
  embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
13
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
14
- model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base") # And this line
15
 
16
  # Initialize Pinecone with environment variable
17
  PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
18
- pinecone.init(api_key=PINECONE_API_KEY, environment="gcp-starter")
19
- index = pinecone.Index("pdf-index")
 
 
 
 
 
 
 
 
 
 
20
 
21
  def process_pdf(file):
22
  # Read PDF content
@@ -76,7 +86,7 @@ def search_documents(query):
76
  )
77
 
78
  # Generate answer using FLAN-T5
79
- context = "\n".join([match.metadata['text'] for match in results.matches])
80
 
81
  prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
82
  inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
@@ -92,7 +102,7 @@ def search_documents(query):
92
  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
93
 
94
  # Format sources
95
- sources = [f"Source: {match.metadata['file_name']}" for match in results.matches]
96
 
97
  return answer, "\n".join(sources)
98
 
 
1
  import gradio as gr
2
+ from pinecone import Pinecone
3
  from sentence_transformers import SentenceTransformer
4
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  import torch
6
  import PyPDF2
7
  import io
8
  import os
9
  from tqdm import tqdm
10
 
11
+ # Initialize models
12
  embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
13
  tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
14
+ model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
15
 
16
  # Initialize Pinecone with environment variable
17
  PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
18
+ pc = Pinecone(api_key=PINECONE_API_KEY)
19
+
20
+ # Create index if it doesn't exist
21
+ if 'pdf-index' not in pc.list_indexes().names():
22
+ pc.create_index(
23
+ name='pdf-index',
24
+ dimension=384, # dimension for 'all-MiniLM-L6-v2'
25
+ metric='cosine'
26
+ )
27
+
28
+ # Connect to index
29
+ index = pc.Index('pdf-index')
30
 
31
  def process_pdf(file):
32
  # Read PDF content
 
86
  )
87
 
88
  # Generate answer using FLAN-T5
89
+ context = "\n".join([match['metadata']['text'] for match in results['matches']])
90
 
91
  prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
92
  inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
 
102
  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
103
 
104
  # Format sources
105
+ sources = [f"Source: {match['metadata']['file_name']}" for match in results['matches']]
106
 
107
  return answer, "\n".join(sources)
108