Bofandra committed on
Commit
e5cb061
·
verified ·
1 Parent(s): a3f8edb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -44
app.py CHANGED
@@ -1,68 +1,87 @@
1
  import os
 
 
 
2
  from PyPDF2 import PdfReader
3
  from sentence_transformers import SentenceTransformer
4
- import faiss
5
- import torch
6
- from transformers import pipeline
7
- import gradio as gr
 
8
 
9
# Models: sentence embedder for retrieval, HF text-generation pipeline for answers.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
model_name = "mistralai/Mistral-7B-Instruct-v0.2" # Replace with your preferred HF model
generator = pipeline("text-generation", model=model_name, device=0 if torch.cuda.is_available() else -1)

# Module-level state: filled in by process_pdf() and read by chat_fn().
texts = []
index = None
 
 
17
 
18
def process_pdf(file):
    """Extract text from an uploaded PDF, chunk it, and build a FAISS index.

    Side effects: replaces the module-level globals `texts` (the 500-char
    chunks) and `index` (a FAISS L2 index over their embeddings).
    Returns a status message string for the UI.
    """
    global texts, index

    reader = PdfReader(file.name)

    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned images) — skip those instead of crashing on None + "\n".
    page_texts = (page.extract_text() for page in reader.pages)
    full_text = "\n".join(t for t in page_texts if t)

    # Fixed 500-character windows keep each retrieved chunk prompt-sized.
    chunks = [full_text[i:i + 500] for i in range(0, len(full_text), 500)]
    if not chunks:
        # Nothing to embed — encoding an empty list would break indexing.
        return "No extractable text found in this PDF."

    texts = chunks
    embeddings = embedder.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return "PDF processed. Ask me anything about it!"
 
 
 
33
 
34
def chat_fn(message, history):
    """Answer `message` using the indexed PDF chunks as retrieval context.

    `history` is supplied by gr.ChatInterface and is intentionally unused.
    Returns the model's answer as a string.
    """
    if index is None or not texts:
        return "Please upload and process a PDF first."

    q_embedding = embedder.encode([message])
    # Retrieve up to 3 nearest chunks. Never ask for more neighbours than
    # the index holds, and drop the -1 padding ids FAISS returns when fewer
    # than k vectors exist — texts[-1] would otherwise be silently wrong.
    k = min(3, len(texts))
    D, I = index.search(q_embedding, k=k)
    context = "\n".join(texts[i] for i in I[0] if i >= 0)

    prompt = f"""You are a helpful assistant. Use the context to answer the question.

Context:
{context}

Question:
{message}

Answer:"""

    output = generator(prompt, max_new_tokens=300, do_sample=True)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only what follows the final "Answer:".
    answer = output.split("Answer:")[-1].strip()
    return answer
55
 
 
56
# --- Gradio UI: upload row on top, chat widget below ---------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 PDF ChatBot - Ask Anything from Your Document")

    # Upload controls: PDF picker, status read-out, and the trigger button.
    with gr.Row():
        file = gr.File(label="Upload PDF", file_types=[".pdf"])
        status = gr.Textbox(label="Status", interactive=False)
        upload_btn = gr.Button("Process PDF")

    upload_btn.click(process_pdf, inputs=file, outputs=status)

    # Chat widget wired to the retrieval-augmented answer function.
    chatbot = gr.ChatInterface(chat_fn)

demo.launch()
 
1
  import os
2
+ import gradio as gr
3
+ import faiss
4
+ import pickle
5
  from PyPDF2 import PdfReader
6
  from sentence_transformers import SentenceTransformer
7
+ from huggingface_hub import InferenceClient
8
+
9
# Models: sentence embedder for retrieval, hosted LLM client for generation.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
llm = InferenceClient("google/gemma-7b-it", token=os.getenv("HF_TOKEN"))  # Or any other model you prefer

# One subdirectory per indexed PDF (FAISS index + pickled chunks) lives here.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)
 
 
15
 
16
# Save uploaded PDF and index its content
def save_pdf(file, title):
    """Index an uploaded PDF under DATA_DIR/<title>.

    Extracts the text, splits it into 500-char chunks, embeds the chunks,
    and writes a FAISS index plus the pickled chunks into the title folder.
    Returns a status message string for the UI.
    """
    title = title.strip()
    # `title` becomes a directory name joined onto DATA_DIR: reject empty
    # titles (they would alias DATA_DIR itself) and anything that could
    # escape the data directory (path separators, parent references).
    if not title or ".." in title or os.sep in title or (os.altsep and os.altsep in title):
        return "Please use a simple, non-empty title (no path separators)."

    folder = os.path.join(DATA_DIR, title)
    if os.path.exists(folder):
        return f"'{title}' already exists. Use a different title."

    # Extract text; extract_text() yields None for image-only pages.
    reader = PdfReader(file.name)
    full_text = "\n".join(p.extract_text() for p in reader.pages if p.extract_text())

    # Chunk text into fixed 500-character windows.
    chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
    if not chunks:
        # Bail out before creating the folder so failed uploads leave no trace.
        return f"No extractable text found in '{title}'."

    # Embed and index.
    embeddings = embedder.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Persist index and chunks for later retrieval by ask_question().
    os.makedirs(folder, exist_ok=True)
    faiss.write_index(index, os.path.join(folder, "index.faiss"))
    with open(os.path.join(folder, "chunks.pkl"), "wb") as f:
        pickle.dump(chunks, f)

    return f"Saved and indexed '{title}'."
 
 
42
 
43
# Return all available PDF titles
def list_titles():
    """Return the titles of every indexed PDF (subdirectories of DATA_DIR)."""
    titles = []
    for entry in os.scandir(DATA_DIR):
        if entry.is_dir():
            titles.append(entry.name)
    return titles
46
 
47
# Ask question using selected PDFs as context
def ask_question(message, history, selected_titles):
    """Answer `message` once per selected PDF, using that PDF's chunks as context.

    `history` is supplied by gr.ChatInterface and is intentionally unused.
    Returns one markdown section per title, concatenated.
    """
    if not selected_titles:
        return "❗ Please select at least one PDF."

    combined_answer = ""
    for title in selected_titles:
        folder = os.path.join(DATA_DIR, title)
        try:
            index = faiss.read_index(os.path.join(folder, "index.faiss"))
            # chunks.pkl is produced by save_pdf(); only unpickle files this
            # app wrote itself — never user-supplied pickles.
            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
                chunks = pickle.load(f)

            q_embed = embedder.encode([message])
            # Cap k at the index size and drop the -1 padding ids FAISS
            # emits when fewer than k vectors exist — chunks[-1] would
            # otherwise be silently (and wrongly) included.
            k = min(3, index.ntotal)
            D, I = index.search(q_embed, k=k)
            context = "\n".join(chunks[i] for i in I[0] if i >= 0)

            prompt = f"Context:\n{context}\n\nQuestion: {message}\nAnswer:"
            response = llm.text_generation(prompt, max_new_tokens=200)
            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
        except Exception as e:
            # Best-effort per PDF: report the failure inline and keep going.
            combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"

    return combined_answer.strip()
 
 
71
 
72
# --- Gradio UI: one tab to ingest PDFs, one tab to chat over them --------
with gr.Blocks() as demo:
    with gr.Tab("📤 Upload PDF"):
        # Upload controls: file picker, title field, trigger, status read-out.
        file = gr.File(label="PDF File")
        title = gr.Textbox(label="Title for PDF")
        upload_btn = gr.Button("Upload and Index")
        upload_status = gr.Textbox(label="Status")
        upload_btn.click(save_pdf, inputs=[file, title], outputs=upload_status)

    with gr.Tab("💬 Chat with PDFs"):
        # Checkbox list of indexed PDFs; refresh re-queries the data folder.
        pdf_selector = gr.CheckboxGroup(label="Select PDFs", choices=list_titles())
        refresh_btn = gr.Button("🔄 Refresh PDF List")
        refresh_btn.click(list_titles, outputs=pdf_selector)
        chat = gr.ChatInterface(ask_question, additional_inputs=[pdf_selector])

demo.launch()