# Gradio RAG app: upload PDFs, chunk & index them with FAISS, and chat
# against the indexed content via the Hugging Face Inference API.
import os
import gradio as gr
import faiss
import pickle
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from huggingface_hub import InferenceClient, HfApi
import pdfplumber
# Hugging Face Space persistence
# Space repository that persisted indexes/chunks are uploaded to.
HF_REPO_ID = "MoslemBot/kajibuku" # e.g., "username/your-space-name"
# NOTE(review): token comes from the HF_TOKEN env var; will be None when
# running outside the Space unless the variable is set — confirm locally.
HF_API_TOKEN = os.getenv("HF_TOKEN")
# Shared Hub API client used by upload_to_hub().
api = HfApi()
def upload_to_hub(local_path, remote_path):
    """Upload a local file to the Hugging Face Space repository.

    Args:
        local_path: Path of the file on local disk.
        remote_path: Destination path inside the Space repo
            (e.g. ``data/<title>/index.faiss``).
    """
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=remote_path,
        repo_id=HF_REPO_ID,
        repo_type="space",
        token=HF_API_TOKEN,
    )
    # The success emoji in the original source was corrupted by a bad
    # encoding round-trip (it split the string literal); restored here.
    print(f"✅ Uploaded to Hub: {remote_path}")
# Initialize embedder and LLM client
# Sentence-embedding model used both for indexing chunks and encoding queries
# (the two must match for FAISS search to be meaningful).
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Inference client for chat completions; reads the HF token from the env.
llm = InferenceClient(token=os.getenv("HF_TOKEN"))
# Local root directory: each uploaded PDF gets its own subfolder holding
# its FAISS index and pickled text chunks.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)
# Save uploaded PDF and index its content
def save_pdf(file, title):
    """Extract text from an uploaded PDF, chunk and FAISS-index it, persist
    the artifacts locally, and mirror them to the Hub Space repo.

    Args:
        file: Gradio file object for the uploaded PDF (``file.name`` is its path).
        title: User-chosen title; becomes the folder name for this PDF.

    Returns:
        A human-readable status string for the UI.

    Raises:
        ValueError: If the embedder returns a non-2D embedding array.
    """
    # Strip once and use the cleaned title everywhere, so the local folder
    # and the Hub paths always agree (the original stripped only locally).
    clean_title = title.strip()
    folder = os.path.join(DATA_DIR, clean_title)
    if os.path.exists(folder):
        return f"'{clean_title}' already exists. Use a different title."
    os.makedirs(folder, exist_ok=True)
    # Extract text. extract_text() returns None for image-only pages, which
    # previously crashed the string concatenation — skip those pages.
    with pdfplumber.open(file.name) as pdf:
        page_texts = []
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                page_texts.append(text)
    full_text = "\n".join(page_texts)
    if not full_text:
        return f"❌ No extractable text found in '{clean_title}'."
    # Chunk into fixed-size 500-character windows (no overlap).
    chunks = [full_text[i:i + 500] for i in range(0, len(full_text), 500)]
    # Embed and index
    embeddings = embedder.encode(chunks)
    if len(embeddings.shape) != 2:
        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    # Save index and chunks locally
    index_path = os.path.join(folder, "index.faiss")
    chunks_path = os.path.join(folder, "chunks.pkl")
    faiss.write_index(index, index_path)
    with open(chunks_path, "wb") as f:
        pickle.dump(chunks, f)
    # Mirror both artifacts to the Hub so they survive Space restarts.
    upload_to_hub(index_path, f"data/{clean_title}/index.faiss")
    upload_to_hub(chunks_path, f"data/{clean_title}/chunks.pkl")
    return f"✅ Saved and indexed '{clean_title}', and uploaded to Hub. Please reload (refresh) the page."
# Return all available PDF titles
def list_titles():
    """Return the names of all per-PDF folders under DATA_DIR."""
    entries = os.listdir(DATA_DIR)
    print(f"Listing in: {DATA_DIR} β {entries}")
    titles = []
    for entry in entries:
        if os.path.isdir(os.path.join(DATA_DIR, entry)):
            titles.append(entry)
    return titles
# Ask question using selected PDFs as context
def ask_question(message, history, selected_titles):
    """Answer a chat message using the selected PDFs as retrieval context.

    Args:
        message: The user's question.
        history: Chat history supplied by gr.ChatInterface (unused here).
        selected_titles: Titles (folder names) whose indexes should be queried.

    Returns:
        A markdown string with one answer section per selected title; failures
        for individual titles are reported inline rather than aborting.
    """
    if not selected_titles:
        return "β Please select at least one PDF."
    combined_answer = ""
    for title in selected_titles:
        folder = os.path.join(DATA_DIR, title)
        try:
            # Load this title's FAISS index and its matching pickled chunks.
            index = faiss.read_index(os.path.join(folder, "index.faiss"))
            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
                chunks = pickle.load(f)
            # Embed the question and retrieve the 3 nearest chunks as context.
            q_embed = embedder.encode([message])
            D, I = index.search(q_embed, k=3)
            context = "\n".join([chunks[i] for i in I[0]])
            #prompt = f"Context:\n{context}\n\nQuestion: {message}\nAnswer:"
            #print(prompt)
            response = llm.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
                ],
                model="deepseek-ai/DeepSeek-R1-0528",
                max_tokens=2048,
            )
            # Pull the assistant's text out of the chat-completion response.
            response = response.choices[0].message["content"]
            #response = llm.text_generation(prompt, max_new_tokens=200)
            #print(response)
            combined_answer += f"**{title}**:\n{response.strip()}\n\n"
        except Exception as e:
            # Deliberate best-effort: one broken title shouldn't sink the rest.
            combined_answer += f"β οΈ Error with {title}: {str(e)}\n\n"
    return combined_answer.strip()
# Gradio UI
# (Removed a stray trailing "|" after demo.launch() — extraction residue
# that made the script a syntax error.)
with gr.Blocks() as demo:
    # Tab 1: upload a PDF and build its index via save_pdf().
    with gr.Tab("π€ Upload PDF"):
        file = gr.File(label="PDF File", file_types=[".pdf"])
        title = gr.Textbox(label="Title for PDF")
        upload_btn = gr.Button("Upload and Index")
        upload_status = gr.Textbox(label="Status")
        upload_btn.click(fn=save_pdf, inputs=[file, title], outputs=upload_status)
    # Tab 2: chat against one or more previously indexed PDFs.
    with gr.Tab("π¬ Chat with PDFs"):
        # Choices are computed once at app build; the refresh button re-lists.
        pdf_selector = gr.CheckboxGroup(label="Select PDFs", choices=list_titles())
        refresh_btn = gr.Button("π Refresh PDF List")
        refresh_btn.click(fn=list_titles, outputs=pdf_selector)
        chat = gr.ChatInterface(fn=ask_question, additional_inputs=[pdf_selector])

demo.launch()