Spaces:
Sleeping
Sleeping
# --- Dependencies ---
import fitz  # PyMuPDF
import faiss
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# --- Models ---
# Loaded once at import time so every request reuses the same instances.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # abstractive summarizer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for retrieval
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")  # extractive QA
def extract_text_from_pdf(file_path):
    """Extract the plain text of every page in a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Fix: the original never closed the document (file-handle leak).
    # fitz.Document is a context manager, so `with` guarantees cleanup.
    with fitz.open(file_path) as doc:
        # join() avoids the quadratic `text += ...` accumulation.
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_chunk_size=500):
    """Split *text* into whitespace-delimited chunks of roughly max_chunk_size characters.

    A chunk is flushed as soon as its running length (words plus one joining
    space each) reaches max_chunk_size; any remainder becomes a final chunk.

    Args:
        text: Source string to split.
        max_chunk_size: Approximate character budget per chunk.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only input).
    """
    pieces = []
    buffer = []
    budget_used = 0
    for token in text.split():
        buffer.append(token)
        budget_used += len(token) + 1  # +1 accounts for the joining space
        if budget_used < max_chunk_size:
            continue
        pieces.append(" ".join(buffer))
        buffer, budget_used = [], 0
    if buffer:  # flush the trailing partial chunk
        pieces.append(" ".join(buffer))
    return pieces
def build_faiss_index(chunks):
    """Embed *chunks* and build a flat L2 FAISS index over the vectors.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        Tuple of (populated FAISS index, raw embedding matrix).
    """
    vectors = embedding_model.encode(chunks)
    flat_index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2 search, no training needed
    flat_index.add(np.array(vectors))
    return flat_index, vectors
def retrieve_relevant_chunks(query, chunks, index, embeddings, top_k=3):
    """Return the *top_k* chunks nearest to *query* by L2 distance.

    Args:
        query: Natural-language query string.
        chunks: The chunk list the index was built from.
        index: FAISS index over the chunk embeddings.
        embeddings: Unused; kept for interface compatibility (the index
            already holds the vectors).
        top_k: Number of chunks to retrieve.

    Returns:
        List of the top_k most relevant chunk strings.
    """
    query_vec = np.array(embedding_model.encode([query]))
    _, hit_ids = index.search(query_vec, top_k)
    return [chunks[pos] for pos in hit_ids[0]]
def summarize_pdf(file_path):
    """Summarize a PDF by summarizing fixed-size character windows of its text.

    Args:
        file_path: Path to the PDF to summarize.

    Returns:
        The per-window summaries joined with single spaces.
    """
    document_text = extract_text_from_pdf(file_path)
    window = 1024  # character window sized for the BART input limit
    partial_summaries = []
    for start in range(0, len(document_text), window):
        segment = document_text[start:start + window]
        result = summarizer(segment, max_length=130, min_length=30, do_sample=False)
        partial_summaries.append(result[0]['summary_text'])
    return " ".join(partial_summaries).strip()
def answer_question(file_path, question):
    """Answer *question* about a PDF via retrieval-augmented extractive QA.

    Pipeline: extract text -> chunk -> index -> retrieve nearest chunks ->
    run the QA model over the concatenated retrieved context.

    Args:
        file_path: Path to the PDF to query.
        question: Natural-language question.

    Returns:
        The extracted answer span as a string.
    """
    document_text = extract_text_from_pdf(file_path)
    passages = chunk_text(document_text)
    faiss_index, vectors = build_faiss_index(passages)
    best_passages = retrieve_relevant_chunks(question, passages, faiss_index, vectors)
    prediction = qa_pipeline(question=question, context=" ".join(best_passages))
    return prediction['answer']
# --- Gradio UI ---
# NOTE(review): the original indentation was lost in extraction; the Row is
# assumed to hold only the file input in each tab — confirm intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarizer and Q&A")

    with gr.Tab("Summarization"):
        with gr.Row():
            pdf_input = gr.File(type="filepath", label="Upload a PDF")
        summarize_button = gr.Button("Summarize")
        summary_output = gr.Textbox(label="Summary", lines=10)
        summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)

    with gr.Tab("Question Answering"):
        with gr.Row():
            pdf_input_qa = gr.File(type="filepath", label="Upload a PDF")
        question_input = gr.Textbox(label="Enter your question")
        answer_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer", lines=2)
        answer_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)

if __name__ == "__main__":
    demo.launch()