prithvi1029 committed on
Commit
d26337d
·
verified ·
1 Parent(s): e2117cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -43
app.py CHANGED
@@ -1,67 +1,163 @@
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- from langchain_community.document_loaders import PyPDFLoader
4
- from langchain_text_splitters import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
- from langchain_community.vectorstores import FAISS
7
- from langchain_huggingface import HuggingFaceEndpoint
8
 
 
 
 
 
9
 
10
- def run_qa(pdf_path, question):
11
- if pdf_path is None or not question.strip():
12
- return "Please upload a PDF and enter a question."
13
 
14
- # Load PDF
15
- loader = PyPDFLoader(pdf_path)
16
- docs = loader.load()
17
 
18
- # Split
19
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
20
- chunks = splitter.split_documents(docs)
21
 
22
- # Embeddings
23
- embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
24
- vectordb = FAISS.from_documents(chunks, embeddings)
25
 
26
- # Retrieve
27
- retriever = vectordb.as_retriever(search_kwargs={"k": 4})
28
- retrieved_docs = retriever.get_relevant_documents(question)
29
- context = "\n\n".join([d.page_content for d in retrieved_docs])
 
30
 
31
- # Hugging Face LLM
32
- llm = HuggingFaceEndpoint(
33
- repo_id="mistralai/Mistral-7B-Instruct-v0.2",
34
- temperature=0.2,
35
- max_new_tokens=512,
36
- )
 
 
 
 
 
37
 
38
- prompt = f"""
39
- You are a helpful assistant. Answer ONLY using the context.
40
- If the answer is not present, say "I don't know".
41
 
42
  Context:
43
  {context}
44
 
45
- Question:
46
- {question}
47
 
48
- Answer:
49
- """
50
 
51
- answer = llm.invoke(prompt)
52
 
53
- sources = "\n\n".join([d.page_content[:500] for d in retrieved_docs[:2]])
54
 
55
- return f"### Answer\n{answer}\n\n---\n### Sources\n{sources}"
56
 
57
-
58
- with gr.Blocks(title="Agentic Document Intelligence") as demo:
59
- gr.Markdown("# πŸ“„ Agentic Document Intelligence (HF LLM)")
 
 
60
 
61
  pdf = gr.File(label="Upload PDF", type="filepath")
62
- question = gr.Textbox(label="Ask a question")
63
- output = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- gr.Button("Run").click(run_qa, inputs=[pdf, question], outputs=output)
 
 
 
 
66
 
67
  demo.launch()
 
1
+ import os
2
+ import re
3
  import gradio as gr
4
+ import faiss
5
+ import numpy as np
6
+
7
+ from pypdf import PdfReader
8
+ from sentence_transformers import SentenceTransformer
9
+ from huggingface_hub import InferenceClient
10
+
11
+
12
+ # -----------------------------
13
+ # Config
14
+ # -----------------------------
15
# Hugging Face access token: HUGGINGFACEHUB_API_TOKEN is consulted first and
# HF_TOKEN is the fallback when the former is unset or empty.
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")

# Text-generation model queried through the Inference API; override it with
# the HF_LLM_MODEL environment variable.
HF_LLM_MODEL = os.environ.get("HF_LLM_MODEL", "mistralai/Mistral-7B-Instruct-v0.3")

# Sentence-embedding model used for both chunks and questions; override it
# with the EMBED_MODEL environment variable.
EMBED_MODEL_NAME = os.environ.get(
    "EMBED_MODEL",
    "sentence-transformers/all-MiniLM-L6-v2",
)

# Number of chunks retrieved for each question.
TOP_K = 4
21
+
22
+
23
+ # -----------------------------
24
+ # Helpers
25
+ # -----------------------------
26
def clean_text(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim both ends."""
    # str.split() with no separator splits on whitespace runs and ignores
    # leading/trailing whitespace, so joining the pieces with a single space
    # yields the normalised text in one pass.
    return " ".join(s.split())
29
+
30
def chunk_text(text: str, chunk_size: int = 900, overlap: int = 150):
    """Split *text* into overlapping, whitespace-normalised character windows.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of raw characters per window; must be
            positive.
        overlap: Number of characters at the end of one window that are
            repeated at the start of the next; must be smaller than
            ``chunk_size``.

    Returns:
        A list of cleaned windows in document order. Windows whose cleaned
        text is 30 characters or shorter are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Such settings cannot move the window
            forward and previously made the loop spin forever on any text
            longer than a single window.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    # Each window starts `stride` characters after the previous one, which
    # leaves exactly `overlap` shared characters between neighbours.
    stride = chunk_size - overlap
    total = len(text)

    windows = []
    for start in range(0, total, stride):
        end = min(total, start + chunk_size)
        windows.append(text[start:end])
        if end == total:
            # The tail is already covered; a further window would only
            # repeat part of this one.
            break

    cleaned = (clean_text(window) for window in windows)
    return [piece for piece in cleaned if len(piece) > 30]
43
+
44
def pdf_to_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, newline-joined.

    Pages whose extracted text is missing or whitespace-only are skipped, so
    the result is an empty string when nothing could be extracted.
    """
    document = PdfReader(pdf_path)
    # extract_text() may yield a falsy value; treat that as an empty page.
    page_texts = (page.extract_text() or "" for page in document.pages)
    return "\n".join(body for body in page_texts if body.strip())
52
+
53
def build_faiss_index(chunks, embedder):
    """Embed *chunks* and load the vectors into a flat inner-product index.

    Args:
        chunks: Sequence of chunk strings to embed.
        embedder: Object exposing ``encode(texts, convert_to_numpy=...,
            normalize_embeddings=...)``.

    Returns:
        ``(index, vectors)``: the populated FAISS index and the array that
        ``embedder.encode`` returned.
    """
    embeddings = embedder.encode(
        chunks,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    # The embeddings are requested normalised, so ranking by inner product
    # is ranking by cosine similarity.
    faiss_index = faiss.IndexFlatIP(embeddings.shape[1])
    faiss_index.add(embeddings.astype(np.float32))
    return faiss_index, embeddings
59
+
60
def retrieve(query, embedder, index, chunks, k=TOP_K):
    """Return up to *k* ``(score, chunk_text)`` pairs for *query*.

    The query is embedded with the same normalised encoding used for the
    chunks and searched against *index*. Result slots whose id is -1 (no
    match for that slot) are skipped, so fewer than *k* pairs may be returned.
    """
    query_vector = embedder.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype(np.float32)
    scores, ids = index.search(query_vector, k)

    # Only one query was submitted, so row 0 holds all of its results.
    return [
        (float(score), chunks[int(chunk_id)])
        for score, chunk_id in zip(scores[0], ids[0])
        if chunk_id != -1
    ]
69
+
70
def hf_generate(client: InferenceClient, prompt: str) -> str:
    """Send *prompt* to ``client.text_generation`` and return the trimmed reply.

    The sampling settings are fixed: up to 450 new tokens, temperature 0.2,
    top-p 0.9 and a repetition penalty of 1.08.
    """
    generation_options = {
        "max_new_tokens": 450,
        "temperature": 0.2,
        "top_p": 0.9,
        "repetition_penalty": 1.08,
    }
    completion = client.text_generation(prompt, **generation_options)
    return completion.strip()
80
 
 
 
 
 
 
81
 
82
+ # -----------------------------
83
+ # App logic (cached state)
84
+ # -----------------------------
85
+ embedder = SentenceTransformer(EMBED_MODEL_NAME)
86
 
87
def on_upload(pdf_path):
    """Index a newly uploaded PDF so questions can be asked about it.

    Args:
        pdf_path: Filesystem path of the uploaded PDF, or a falsy value when
            no file is selected.

    Returns:
        An ``(index, chunks, status)`` tuple. ``index`` and ``chunks`` are
        ``None`` whenever the PDF could not be indexed; ``status`` is the
        message shown to the user in either case.
    """
    if not pdf_path:
        return None, None, "Please upload a PDF."

    text = pdf_to_text(pdf_path)
    if not text.strip():
        return None, None, "Could not extract text from this PDF (it may be scanned). Try a text-based PDF."

    chunks = chunk_text(text)
    # A single chunk is enough to answer from: retrieve() already skips the
    # -1 ids returned when the index holds fewer than k vectors, so short
    # documents must not be rejected here.
    if not chunks:
        return None, None, "Not enough extractable text to build RAG index."

    index, _ = build_faiss_index(chunks, embedder)
    return index, chunks, f"✅ Indexed {len(chunks)} chunks. Now ask a question."
 
101
 
102
def answer_question(index, chunks, question):
    """Answer *question* from the indexed PDF via retrieval-augmented generation.

    Args:
        index: Vector index produced by ``on_upload``, or ``None`` when no
            PDF has been indexed yet.
        chunks: Chunk texts whose positions match the index rows, or ``None``.
        question: The user's question.

    Returns:
        Markdown containing the model's answer followed by the retrieved
        source excerpts, or a short instruction when a prerequisite (indexed
        PDF, non-blank question, API token) is missing.
    """
    if index is None or chunks is None:
        return "Upload a PDF first."
    if not question or not question.strip():
        return "Type a question."

    if not HF_TOKEN:
        return (
            "HF token not found. Go to Space → Settings → Variables and secrets → "
            "add Secret named HUGGINGFACEHUB_API_TOKEN, then Restart Space."
        )

    hits = retrieve(question, embedder, index, chunks, k=TOP_K)
    # Number the passages so the prompt context lines up with the numbered
    # sources listed under the answer.
    context = "\n\n".join(
        f"[{rank}] {passage}" for rank, (_, passage) in enumerate(hits, start=1)
    )

    prompt = f"""You are a helpful assistant. Answer using ONLY the context.
If the answer is not in the context, say "I don't know from the provided document."

Question: {question}

Context:
{context}

Answer:"""

    client = InferenceClient(model=HF_LLM_MODEL, token=HF_TOKEN)
    ans = hf_generate(client, prompt)

    # Show at most the first 600 characters of each retrieved passage.
    sources = "\n\n".join(
        f"**Source {rank} (score={score:.3f})**\n{passage[:600]}..."
        for rank, (score, passage) in enumerate(hits, start=1)
    )

    return f"### Answer\n{ans}\n\n---\n### Retrieved Sources\n{sources}"
133
 
 
134
 
135
+ # -----------------------------
136
+ # UI
137
+ # -----------------------------
138
with gr.Blocks(title="Agentic Document Intelligence (HF RAG)") as demo:
    gr.Markdown(
        "# 📄 Agentic Document Intelligence\n"
        "Upload a PDF and ask questions (RAG) — using Hugging Face Inference API."
    )

    pdf = gr.File(label="Upload PDF", type="filepath")
    status = gr.Markdown()

    # State components carry the index and chunk list from the upload
    # handler to the question handler.
    index_state = gr.State(None)
    chunks_state = gr.State(None)

    # Re-index whenever the selected file changes.
    pdf.change(
        fn=on_upload,
        inputs=[pdf],
        outputs=[index_state, chunks_state, status],
    )

    question = gr.Textbox(label="Ask a question", placeholder="e.g., What is the payment term?")
    out = gr.Markdown()
    btn = gr.Button("Run")

    btn.click(
        fn=answer_question,
        inputs=[index_state, chunks_state, question],
        outputs=[out],
    )

demo.launch()