File size: 4,814 Bytes
a3f8edb
e5cb061
 
 
a3f8edb
 
ed1f8e5
80f0ff1
ed1f8e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5cb061
 
 
e6d25ac
a3f8edb
e5cb061
 
a3f8edb
e5cb061
 
 
 
 
a3f8edb
e5cb061
 
 
80f0ff1
 
 
 
9828de9
 
 
80f0ff1
2828f4b
a3f8edb
e5cb061
a3f8edb
 
e5cb061
a3f8edb
2828f4b
 
 
 
 
e5cb061
a3f8edb
 
ed1f8e5
 
 
 
 
e5cb061
a3f8edb
ed1f8e5
dd15c57
 
ed1f8e5
2828f4b
a3f8edb
e5cb061
 
38f6acd
e5cb061
a3f8edb
e5cb061
 
 
 
a3f8edb
e5cb061
 
 
 
 
 
 
a3f8edb
e5cb061
 
 
a3f8edb
e6d25ac
 
 
 
 
 
 
 
c7f6187
e6d25ac
 
 
 
 
 
e5cb061
 
 
a3f8edb
e5cb061
a3f8edb
e5cb061
a3f8edb
e5cb061
47c91af
e5cb061
 
 
 
 
 
 
 
 
 
a3f8edb
ed1f8e5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import gradio as gr
import faiss
import pickle
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from huggingface_hub import InferenceClient, HfApi
import pdfplumber

# Hugging Face Space persistence
HF_REPO_ID = "MoslemBot/kajibuku"  # e.g., "username/your-space-name"
HF_API_TOKEN = os.getenv("HF_TOKEN")  # Hub write token; None if the env var is unset (uploads will then fail)
api = HfApi()  # shared Hub client used for file uploads

def upload_to_hub(local_path, remote_path):
    """Push a local file into this Space's repo on the Hugging Face Hub.

    Parameters:
        local_path: path of the file on local disk.
        remote_path: destination path inside the Space repo.
    """
    upload_kwargs = {
        "path_or_fileobj": local_path,
        "path_in_repo": remote_path,
        "repo_id": HF_REPO_ID,
        "repo_type": "space",
        "token": HF_API_TOKEN,
    }
    api.upload_file(**upload_kwargs)
    print(f"βœ… Uploaded to Hub: {remote_path}")

# Initialize embedder and LLM client
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # sentence-embedding model used for both chunks and queries
llm = InferenceClient(token=os.getenv("HF_TOKEN"))  # Inference API client; same HF_TOKEN env var as the Hub uploads

# Local folder holding one subdirectory per indexed PDF title
# (each containing index.faiss + chunks.pkl).
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

# Save uploaded PDF and index its content
def save_pdf(file, title):
    """Extract text from an uploaded PDF, chunk and embed it, build a FAISS
    index, persist index + chunks locally, and mirror both to the Hub.

    Parameters:
        file: Gradio File object; ``file.name`` is the local path of the PDF.
        title: user-supplied title; used (stripped) as the storage folder name.

    Returns:
        A human-readable status string for display in the Gradio UI.
    """
    # Normalize once and reuse everywhere so the local folder and the
    # Hub path always agree (original stripped only for the folder).
    title = title.strip()
    if not title:
        return "Please provide a non-empty title."

    folder = os.path.join(DATA_DIR, title)
    if os.path.exists(folder):
        return f"'{title}' already exists. Use a different title."

    os.makedirs(folder, exist_ok=True)

    # Extract text page by page. extract_text() returns None for pages
    # with no extractable text (e.g. scanned images); substitute "" so
    # concatenation cannot raise TypeError.
    with pdfplumber.open(file.name) as pdf:
        full_text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

    print(full_text)

    # Fixed-size character chunks (500 chars, no overlap)
    chunks = [full_text[i:i+500] for i in range(0, len(full_text), 500)]
    if not chunks:
        # Encoding an empty list would yield a non-2D embeddings array and
        # fail below with a confusing error; report the real cause instead.
        return f"No extractable text found in '{title}'."

    # Embed and index
    embeddings = embedder.encode(chunks)

    print("Embeddings shape:", embeddings.shape)
    if len(embeddings.shape) != 2:
        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Save index and chunks locally
    index_path = os.path.join(folder, "index.faiss")
    chunks_path = os.path.join(folder, "chunks.pkl")
    faiss.write_index(index, index_path)
    with open(chunks_path, "wb") as f:
        pickle.dump(chunks, f)

    # Upload to hub (stripped title keeps remote path aligned with local folder)
    upload_to_hub(index_path, f"data/{title}/index.faiss")
    upload_to_hub(chunks_path, f"data/{title}/chunks.pkl")

    return f"βœ… Saved and indexed '{title}', and uploaded to Hub. Please reload (refresh) the page."

# Return all available PDF titles
def list_titles():
    """Return the names of all subdirectories of DATA_DIR (one per indexed PDF)."""
    entries = os.listdir(DATA_DIR)
    print(f"Listing in: {DATA_DIR} β†’ {entries}")
    titles = []
    for entry in entries:
        if os.path.isdir(os.path.join(DATA_DIR, entry)):
            titles.append(entry)
    return titles

# Ask question using selected PDFs as context
def ask_question(message, history, selected_titles):
    """Answer a chat message using retrieved chunks from the selected PDFs.

    Parameters:
        message: the user's question.
        history: chat history supplied by gr.ChatInterface (unused here).
        selected_titles: titles ticked in the CheckboxGroup.

    Returns:
        A markdown string with one answer section per selected title;
        per-title failures are reported inline rather than aborting the rest.
    """
    if not selected_titles:
        return "❗ Please select at least one PDF."

    combined_answer = ""
    for title in selected_titles:
        folder = os.path.join(DATA_DIR, title)
        try:
            index = faiss.read_index(os.path.join(folder, "index.faiss"))
            with open(os.path.join(folder, "chunks.pkl"), "rb") as f:
                chunks = pickle.load(f)

            # Retrieve top-3 nearest chunks. FAISS pads the result with -1
            # when k exceeds the number of stored vectors (tiny PDFs), which
            # would silently index chunks[-1]; filter invalid ids out.
            q_embed = embedder.encode([message])
            D, I = index.search(q_embed, k=3)
            context = "\n".join(chunks[i] for i in I[0] if 0 <= i < len(chunks))

            response = llm.chat_completion(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant. Answer based only on the given context."},
                    {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {message}"}
                ],
                model="deepseek-ai/DeepSeek-R1-0528",
                max_tokens=2048,
            )

            # Keep the API object and the extracted text in separate names
            # (original rebound `response` to a str, shadowing the object).
            answer_text = response.choices[0].message["content"]

            combined_answer += f"**{title}**:\n{answer_text.strip()}\n\n"
        except Exception as e:
            # Deliberate best-effort: one broken index must not block others.
            combined_answer += f"⚠️ Error with {title}: {str(e)}\n\n"

    return combined_answer.strip()

# Gradio UI
with gr.Blocks() as demo:
    # Tab 1: upload a PDF and build its search index via save_pdf.
    with gr.Tab("πŸ“€ Upload PDF"):
        file = gr.File(label="PDF File", file_types=[".pdf"])
        title = gr.Textbox(label="Title for PDF")
        upload_btn = gr.Button("Upload and Index")
        upload_status = gr.Textbox(label="Status")
        upload_btn.click(fn=save_pdf, inputs=[file, title], outputs=upload_status)

    # Tab 2: chat with one or more previously indexed PDFs.
    with gr.Tab("πŸ’¬ Chat with PDFs"):
        # choices are computed once at app build time; the refresh button
        # re-runs list_titles to pick up PDFs indexed after launch.
        # NOTE(review): list_titles returns a plain list — confirm this updates
        # the CheckboxGroup's *choices* (not just its value) in this Gradio
        # version; gr.update(choices=...) may be required.
        pdf_selector = gr.CheckboxGroup(label="Select PDFs", choices=list_titles())
        refresh_btn = gr.Button("πŸ”„ Refresh PDF List")
        refresh_btn.click(fn=list_titles, outputs=pdf_selector)
        chat = gr.ChatInterface(fn=ask_question, additional_inputs=[pdf_selector])

demo.launch()