Satyam0077 committed on
Commit
9b255cc
·
verified ·
1 Parent(s): 213aeb3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import os

import cohere
import gradio as gr
import PyPDF2
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
7
+
8
def _require_env(name):
    """Return the value of environment variable *name*, or fail loudly.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; it is required to start the app."
        )
    return value


# SECURITY: API keys must never be committed to source control. The keys that
# used to be hard-coded here remain visible in the repository history and
# should be revoked and rotated. Provide fresh keys through the environment
# (for example as secrets of the hosting platform).

# Initialize Pinecone and connect to the index
pc = Pinecone(api_key=_require_env("PINECONE_API_KEY"))
index = pc.Index(os.environ.get("PINECONE_INDEX_NAME", "quickstart"))

# Load the sentence transformer model
# NOTE(review): the Pinecone index dimension must match this model's
# embedding size -- confirm against the index configuration.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Cohere with the API key taken from the environment
co = cohere.Client(_require_env("COHERE_API_KEY"))
17
+
18
def extract_text_from_pdf(pdf_file):
    """Extract the text of an uploaded PDF, reporting problems as messages.

    Args:
        pdf_file: Either a filesystem path (what ``gr.File(type="filepath")``
            hands to callbacks) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        The concatenated text of all pages, or a human-readable error
        message when the file is missing, empty, or unreadable. Errors are
        reported through the return value; no exception is propagated.
    """
    if pdf_file is None:
        return "No file uploaded."

    try:
        # A plain path must be used as-is: the previous ``pdf_file.name``
        # raised AttributeError for strings, so every upload failed.
        if isinstance(pdf_file, (str, bytes)) or hasattr(pdf_file, "__fspath__"):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        with open(pdf_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # extract_text() can yield nothing for a page; treat that as "".
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except OSError as e:
        # Missing/unreadable file: same message the generic handler produced.
        return f"Error reading PDF: {e}"
    except PyPDF2.errors.PdfReadError:
        return "The uploaded PDF is encrypted or unreadable."
    except Exception as e:
        # Deliberate catch-all: the result is shown directly in the UI.
        return f"Error reading PDF: {str(e)}"

    if not text.strip():
        return "The uploaded PDF is empty or has no readable content."
    return text
39
+
40
def store_pdf_embeddings(pdf_text, segment_size=512, batch_size=100):
    """Embed the PDF text in fixed-size segments and upsert them to the index.

    Args:
        pdf_text: Full text of the PDF.
        segment_size: Number of characters per segment (default 512).
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        A status message for display in the UI.
    """
    # Nothing to embed: avoid encoding an empty list and sending an empty upsert.
    if not pdf_text or not pdf_text.strip():
        return "No text found in the PDF; nothing was stored."

    segments = [
        pdf_text[start:start + segment_size]
        for start in range(0, len(pdf_text), segment_size)
    ]
    embeddings = model.encode(segments)

    # Keep the segment text as metadata: the vector alone cannot be turned
    # back into the passage at query time.
    # NOTE(review): ids restart at "seg-0" on every upload, so a later, shorter
    # PDF overwrites only the leading ids and leaves stale segments behind.
    vectors = [
        (f"seg-{i}", embedding.tolist(), {"text": segment})
        for i, (segment, embedding) in enumerate(zip(segments, embeddings))
    ]

    # Send bounded batches so a large document does not become one oversized request.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
    return "PDF uploaded and stored successfully!"
47
+
48
def ask_question(query):
    """Answer a question using the most relevant stored PDF segment.

    Args:
        query: The user's question.

    Returns:
        A ``(segment_text, answer)`` tuple: the retrieved context shown to the
        user and the generated answer. When the question is blank or the index
        returns no match, the tuple carries an explanatory message instead.
    """
    if not query or not query.strip():
        return "", "Please enter a question."

    query_embedding = model.encode(query).tolist()

    # Retrieve the most relevant segment from Pinecone, including its stored text
    result = index.query(top_k=1, vector=query_embedding, include_metadata=True)
    matches = result['matches']
    if not matches:
        # Empty index: indexing matches[0] would raise IndexError.
        return "No matching segment found.", "Please upload a PDF before asking questions."

    best_match = matches[0]
    metadata = best_match.get('metadata') or {}
    # Use the passage itself as context. Vectors stored without text metadata
    # fall back to the id-only label, which was the previous behaviour.
    segment_text = metadata.get("text") or f"Segment: {best_match['id']}"

    # Generate the answer using the retrieved segment as context
    prompt = f"{segment_text}\nQuestion: {query}\nAnswer:"
    response = co.generate(
        model="command-xlarge-nightly",
        prompt=prompt,
        max_tokens=50,
    )

    # Return both the segment and the answer
    answer = response.generations[0].text.strip()
    return segment_text, answer
68
+
69
# Messages extract_text_from_pdf returns instead of text when extraction fails.
_EXTRACTION_ERRORS = (
    "No file uploaded.",
    "The uploaded PDF is empty or has no readable content.",
    "The uploaded PDF is encrypted or unreadable.",
)
_EXTRACTION_ERROR_PREFIX = "Error reading PDF: "


def _handle_upload(pdf):
    """Extract the uploaded PDF's text and store its embeddings.

    Returns the status message to show in the "Upload Status" box. Extraction
    failures are reported as-is rather than being embedded and stored as if
    they were document text.
    """
    if pdf is None:
        return "Please upload a valid PDF."
    text = extract_text_from_pdf(pdf)
    if text in _EXTRACTION_ERRORS or text.startswith(_EXTRACTION_ERROR_PREFIX):
        return text
    return store_pdf_embeddings(text)


# Gradio Interface Setup
with gr.Blocks() as demo:
    gr.Markdown("# Interactive QA Bot with PDF Support")

    # PDF Upload Section
    pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
    upload_status = gr.Textbox(label="Upload Status", interactive=False)
    upload_button = gr.Button("Upload and Store")

    # Handle PDF Upload
    upload_button.click(_handle_upload, inputs=pdf_input, outputs=upload_status)

    # Question and Answer Section
    query_input = gr.Textbox(label="Enter your question")
    segment_output = gr.Textbox(label="Retrieved Segment", interactive=False)
    answer_output = gr.Textbox(label="Answer", interactive=False)
    query_button = gr.Button("Ask")

    # Handle User Questions
    query_button.click(
        ask_question, inputs=query_input, outputs=[segment_output, answer_output]
    )

demo.launch(share=True)  # share=True requests a public link