abakerdp committed
Commit c016133 · verified · 1 Parent(s): 0cded56

Update app.py

Files changed (1):
  1. app.py +89 -63

app.py CHANGED
@@ -3,99 +3,125 @@ import pinecone
  from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import torch
- from datasets import load_dataset

- # Initialize models and databases
- def init_models():
-     # Load the embedding model
-     embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
-
-     # Load the LLM for answering
-     tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
-     model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
-
-     # Initialize Pinecone
-     pinecone.init(api_key="your-pinecone-api-key", environment="gcp-starter")
-     index = pinecone.Index("test-index")
-
-     # Load your dataset from Hugging Face
-     dataset = load_dataset("your-username/your-dataset-name", split="train")
-
-     return embeddings_model, tokenizer, model, index, dataset

- # Generate response using retrieved context
- def generate_answer(question, context, tokenizer, model):
-     prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
-     inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
-
-     outputs = model.generate(
-         **inputs,
-         max_length=512,
-         num_beams=4,
-         temperature=0.7,
-         top_p=0.9,
-         repetition_penalty=1.2,
-         early_stopping=True
-     )
-
-     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return answer

- def search_documents(query, embeddings_model, index, dataset, top_k=3):
      # Create embedding for the query
      query_embedding = embeddings_model.encode(query)

      # Search Pinecone
      results = index.query(
          vector=query_embedding.tolist(),
-         top_k=top_k,
          include_metadata=True
      )

-     # Get full context from the dataset using metadata
-     contexts = []
-     for match in results.matches:
-         source = match.metadata['source']
-         # Find the corresponding document in the dataset
-         doc = next((item for item in dataset if item['source'] == source), None)
-         if doc:
-             contexts.append(doc['text'])

-     return "\n\n".join(contexts)
-
- # Initialize all models and databases
- embeddings_model, tokenizer, model, index, dataset = init_models()
-
- def process_query(query):
-     # Search for relevant documents
-     context = search_documents(query, embeddings_model, index, dataset)

-     # Generate answer
-     answer = generate_answer(query, context, tokenizer, model)

      # Format sources
-     sources = [f"Source: {match.metadata['source']}" for match in index.query(
-         vector=embeddings_model.encode(query).tolist(),
-         top_k=3,
-         include_metadata=True
-     ).matches]

      return answer, "\n".join(sources)

- # Create the Gradio interface
  with gr.Blocks() as demo:
-     gr.Markdown("# Document Search and Q&A")

-     with gr.Row():
          query_input = gr.Textbox(label="Enter your question")
          search_button = gr.Button("Search")
-
-     with gr.Row():
          answer_output = gr.Textbox(label="Answer")
          sources_output = gr.Textbox(label="Sources")

      search_button.click(
-         process_query,
          inputs=[query_input],
          outputs=[answer_output, sources_output]
      )
 
  from sentence_transformers import SentenceTransformer
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  import torch
+ import PyPDF2
+ import io
+ import os
+ from tqdm import tqdm

+ # Initialize models and Pinecone
+ embeddings_model = SentenceTransformer('all-MiniLM-L6-v2')
+ tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+ model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
+
+ # Initialize Pinecone with environment variable
+ PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
+ pinecone.init(api_key=PINECONE_API_KEY, environment="gcp-starter")
+ index = pinecone.Index("pdf-index")
+
+ def process_pdf(file):
+     # Read PDF content
+     pdf_content = file.read()
+     pdf_file = io.BytesIO(pdf_content)
+     reader = PyPDF2.PdfReader(pdf_file)

+     # Extract text from PDF
+     text_chunks = []
+     for page in reader.pages:
+         text = page.extract_text()
+         # Split into smaller chunks (roughly 1000 characters each)
+         chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+         text_chunks.extend(chunks)

+     # Create embeddings and upload to Pinecone
+     processed_chunks = 0
+     for i, chunk in enumerate(text_chunks):
+         try:
+             # Create embedding
+             embedding = embeddings_model.encode(chunk)
+
+             # Upload to Pinecone
+             index.upsert(
+                 vectors=[(
+                     f"{file.name}_chunk_{i}",
+                     embedding.tolist(),
+                     {
+                         'file_name': file.name,
+                         'chunk_num': i,
+                         'text': chunk
+                     }
+                 )]
+             )
+             processed_chunks += 1
+         except Exception as e:
+             print(f"Error processing chunk {i}: {str(e)}")

+     return f"Successfully processed {processed_chunks} chunks from {file.name}"

+ def process_multiple_pdfs(files):
+     results = []
+     for file in files:
+         result = process_pdf(file)
+         results.append(result)
+     return "\n".join(results)

+ def search_documents(query):
      # Create embedding for the query
      query_embedding = embeddings_model.encode(query)

      # Search Pinecone
      results = index.query(
          vector=query_embedding.tolist(),
+         top_k=3,
          include_metadata=True
      )

+     # Generate answer using FLAN-T5
+     context = "\n".join([match.metadata['text'] for match in results.matches])

+     prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
+     inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)

+     outputs = model.generate(
+         **inputs,
+         max_length=512,
+         num_beams=4,
+         temperature=0.7,
+         top_p=0.9
+     )
+
+     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

      # Format sources
+     sources = [f"Source: {match.metadata['file_name']}" for match in results.matches]

      return answer, "\n".join(sources)

+ # Create Gradio interface
  with gr.Blocks() as demo:
+     gr.Markdown("# PDF Document Search and Q&A")
+
+     with gr.Tab("Upload Documents"):
+         file_output = gr.File(
+             file_count="multiple",
+             label="Upload PDF Files"
+         )
+         upload_button = gr.Button("Process PDFs")
+         upload_output = gr.Textbox(label="Processing Results")

+     with gr.Tab("Search and Ask"):
          query_input = gr.Textbox(label="Enter your question")
          search_button = gr.Button("Search")

          answer_output = gr.Textbox(label="Answer")
          sources_output = gr.Textbox(label="Sources")

+     upload_button.click(
+         process_multiple_pdfs,
+         inputs=[file_output],
+         outputs=[upload_output]
+     )
+
      search_button.click(
+         search_documents,
          inputs=[query_input],
          outputs=[answer_output, sources_output]
      )
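
For local testing, a minimal setup sketch for what this version of app.py assumes: a Pinecone index named pdf-index (the name comes from the diff) sized for all-MiniLM-L6-v2's 384-dimensional embeddings, and PINECONE_API_KEY set in the environment. The create_index/list_indexes calls below follow the classic pinecone-client v2 API implied by the pinecone.init call above; the cosine metric is an assumption, not something shown in the diff.

# Minimal local-run sketch, assuming pinecone-client v2 (as implied by
# pinecone.init above). The cosine metric is an assumption.
import os
import pinecone

pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="gcp-starter")

# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the index must match.
if "pdf-index" not in pinecone.list_indexes():
    pinecone.create_index("pdf-index", dimension=384, metric="cosine")

With the index in place, the Blocks app presumably launches via a demo.launch() call outside this hunk, as is standard for Gradio Spaces.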