import PyPDF2
import faiss
import numpy as np
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BertForMaskedLM,
    BertForQuestionAnswering,
    BertTokenizer,
)

# Single place to decide the device; the generative model is moved here,
# while the BERT embedding model stays on CPU (see encode_text).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# BERT tokenizer + SQuAD-finetuned QA model. Only its encoder (qa_model.bert)
# is used, as the embedding model for retrieval.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
qa_model = BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

# Small causal LM that phrases the final answer from the retrieved context.
gen_model_id = "distilgpt2"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id).to(DEVICE)
def read_pdf(file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file: A file path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The concatenated page text as a single string (may be empty for
        image-only PDFs).
    """
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for pages with no extractable
        # text (e.g. scanned images); guard so += never sees None.
        text += page.extract_text() or ""
    return text
def split_text(text, chunk_size=500):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    The final chunk may be shorter than *chunk_size*; an empty input
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(text), chunk_size):
        chunks.append(text[start:start + chunk_size])
    return chunks
def encode_text(text):
    """Embed *text* as a single float32 vector.

    Tokenizes with the module-level BERT tokenizer, runs the QA model's
    encoder, and mean-pools the last hidden states over the token axis.

    Returns:
        A float32 numpy array of shape (1, hidden_size).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden = qa_model.bert(**encoded).last_hidden_state
    pooled = hidden.mean(dim=1)
    return pooled.numpy().astype(np.float32)
def create_faiss_index(chunks):
    """Build an exact L2 FAISS index over the embeddings of *chunks*.

    Args:
        chunks: Non-empty list of text chunks to embed via encode_text.

    Returns:
        Tuple of (faiss index, stacked embedding matrix).
    """
    vectors = [encode_text(chunk) for chunk in chunks]
    embeddings = np.vstack(vectors)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, embeddings
def search_faq(query, index, k=3):
    """Return the indices of the *k* chunks nearest to *query*.

    Args:
        query: Question text, embedded with encode_text.
        index: A FAISS index built by create_faiss_index.
        k: Number of neighbors to retrieve.

    Returns:
        The FAISS index-id matrix of shape (1, k); distances are discarded.
    """
    query_embedding = encode_text(query)
    _, neighbor_ids = index.search(query_embedding, k)
    return neighbor_ids
def generate_distilgpt2_answer(context, question):
    """Generate an answer to *question* grounded in *context* with distilgpt2.

    Builds a "Context / Question / Answer:" prompt, greedily generates up
    to 64 new tokens, and returns only the text after the final "Answer:".

    Args:
        context: Retrieved chunk of document text.
        question: The user's question.

    Returns:
        The generated answer string (may be empty if nothing was produced).
    """
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    # distilgpt2 has a fixed context window (tokenizer.model_max_length,
    # 1024 tokens); without truncation a long context makes generate()
    # fail once prompt + new tokens exceed it. Reserve room for the
    # 64 generated tokens.
    max_new_tokens = 64
    max_prompt_tokens = gen_tokenizer.model_max_length - max_new_tokens
    inputs = gen_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(gen_model.device)
    with torch.no_grad():
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # GPT-2 has no pad token; reuse EOS so generate() doesn't warn.
            pad_token_id=gen_tokenizer.eos_token_id,
            eos_token_id=gen_tokenizer.eos_token_id,
            repetition_penalty=1.3,
        )
    generated = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text echoes the prompt; keep only what follows "Answer:".
    return generated.split("Answer:")[-1].strip()
def process_pdf_and_answer(pdf_file, query, top_k=1):
    """Answer *query* from *pdf_file*: extract, chunk, retrieve, generate.

    Args:
        pdf_file: PDF path or file object (as delivered by the Gradio
            File input).
        query: The user's question.
        top_k: Number of retrieved chunks to answer from independently.

    Returns:
        Generated answer(s), joined with a "---" separator, or an
        explanatory message when the PDF has no extractable text.
    """
    text = read_pdf(pdf_file)
    chunks = split_text(text)
    # Guard: an image-only/empty PDF yields no chunks, and building an
    # index from zero embeddings (np.vstack([])) would raise.
    if not chunks:
        return "No extractable text was found in the PDF."
    faiss_index, _ = create_faiss_index(chunks)
    # Clamp k: asking FAISS for more neighbors than stored vectors pads
    # the result with -1, and chunks[-1] would silently pick the wrong chunk.
    k = min(top_k, len(chunks))
    indices = search_faq(query, faiss_index, k=k)
    answers = []
    for idx in indices[0]:
        if idx < 0:  # defensive: -1 marks "no neighbor found"
            continue
        answers.append(generate_distilgpt2_answer(chunks[idx], query))
    return "\n\n---\n\n".join(answers)
import gradio as gr

# Web UI: a PDF upload plus a question box, wired to the full
# retrieve-and-generate pipeline above.
interface = gr.Interface(
    fn=process_pdf_and_answer,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Your Question"),
    ],
    outputs=gr.Textbox(label="Generated Answer(s)"),
    title="📄 PDF Question Answering",
    description=(
        "Upload a PDF and ask a question about its content. The model "
        "will try to answer based on the most relevant chunks."
    ),
)

interface.launch()