# NOTE: the original extraction carried hosting-UI artifacts here (a file-size
# banner, a commit-hash blame gutter, and a line-number gutter); they are not
# part of the Python source and have been reduced to this comment.
from huggingface_hub import HfApi
import os
# Authenticate against the Hugging Face Hub only when a token is supplied via
# the HF_TOKEN environment variable; otherwise run anonymously.
# NOTE(review): `api` is not referenced later in this chunk — presumably used
# elsewhere in the project, or the HfApi construction itself registers the
# token for subsequent hub calls. Confirm against callers.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    api = HfApi(token=hf_token)  # Safe: uses token without saving it to disk
import faiss
import pickle
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel, pipeline
# Load CLIP once at import time (downloads weights on first run).
# Used below for embedding question text into the same space as the
# pre-built page image index.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval().to("cpu")  # inference only: eval() disables dropout; explicit CPU placement
# Small seq2seq model shared by ask_local_model() and summarize_document().
# FLAN-T5-small keeps memory modest at the cost of answer quality.
qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    tokenizer="google/flan-t5-small",
    device=-1  # -1 pins the pipeline to CPU
)
def embed_text_with_clip(text):
    """Encode *text* with CLIP and return its L2-normalised float32 vector.

    The normalisation makes inner-product / L2 FAISS search behave like
    cosine similarity against the (presumably also normalised) page index.
    """
    tokenised = clip_processor(text=[text], return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only — no autograd graph needed
        features = clip_model.get_text_features(**tokenised)
    embedding = features[0].numpy()
    embedding = embedding / np.linalg.norm(embedding)
    return embedding.astype("float32")
def search_similar_pages(question, top_k=5):
    """Return metadata for the up-to-``top_k`` pages most similar to *question*.

    Args:
        question: free-text query embedded with CLIP.
        top_k: maximum number of pages to retrieve.

    Returns:
        List of metadata dicts (one per matched page), best match first.

    Raises:
        FileNotFoundError: if the FAISS index or metadata pickle is missing,
            i.e. no PDF has been processed yet.
    """
    if not os.path.exists("vision_index.faiss") or not os.path.exists("metadata.pkl"):
        raise FileNotFoundError("PDF not processed yet.")
    index = faiss.read_index("vision_index.faiss")
    with open("metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    query_vec = embed_text_with_clip(question)
    distances, indices = index.search(np.array([query_vec]), top_k)
    # Bug fix: FAISS pads the result with -1 when the index holds fewer than
    # top_k vectors; metadata[-1] would then silently return the LAST page.
    # Filter out the -1 sentinels (and any out-of-range ids) instead.
    top_pages = [metadata[i] for i in indices[0] if 0 <= i < len(metadata)]
    return top_pages
def ask_local_model(context, question):
    """Answer *question* strictly from *context* via the local FLAN-T5 pipeline.

    Returns the model's generated text; the prompt instructs it to reply
    'Not found in text.' when the context does not contain the answer.
    """
    prompt = (
        f"Based only on the following text:\n\n{context}\n\n"
        f"Answer this question:\n{question}\n\n"
        "Only answer from the text. If unsure, say 'Not found in text.'"
    )
    outputs = qa_pipeline(prompt, max_new_tokens=128, do_sample=False)
    return outputs[0]["generated_text"]
def generate_answer(question, top_k=5):
    """Answer *question* once per distinct retrieved page.

    Generalised: the retrieval depth is now a parameter (``top_k``, default 5
    — identical to the previous hard-coded behaviour) instead of being fixed
    inside search_similar_pages' default.

    Args:
        question: free-text question to answer.
        top_k: how many candidate pages to retrieve before de-duplication.

    Returns:
        List of dicts with keys "page", "thumbnail" and "answer", in
        retrieval order, with duplicate page numbers skipped.

    Raises:
        FileNotFoundError: propagated from search_similar_pages when no PDF
            has been processed yet.
    """
    top_pages = search_similar_pages(question, top_k)
    answers = []
    seen_pages = set()
    for page in top_pages:
        page_no = page["page"]
        if page_no in seen_pages:
            continue  # the index can return several hits for one page
        seen_pages.add(page_no)
        answers.append({
            "page": page_no,
            "thumbnail": page["thumbnail"],
            "answer": ask_local_model(page["ocr"], question),
        })
    return answers
def summarize_document():
    """Produce a short (70-80 word) description of the processed document.

    Returns:
        The model-generated summary, or a user-facing warning string when
        metadata.pkl does not exist yet (no PDF processed).
    """
    if not os.path.exists("metadata.pkl"):
        return "❗ Document not processed yet. Please upload and process a PDF."
    with open("metadata.pkl", "rb") as f:
        metadata = pickle.load(f)
    combined = " ".join(page["ocr"] for page in metadata).strip()
    truncated = combined[:3000]  # keep the prompt under the model's token limit
    prompt = (
        "describe what is the document about and what it contains "
        f"in 70-80 words:\n\n{truncated}"
    )
    generated = qa_pipeline(prompt, max_new_tokens=200, do_sample=False)
    return generated[0]["generated_text"]
# (end of file — stray gutter character from extraction removed)