import re
import textwrap

import fitz
import gradio as gr
from transformers import pipeline
# Both pipelines are built once at import time so that every request handled
# by the functions below reuses the already-loaded models.

# Summarization pipeline using the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
)

# Question-answering pipeline using a BERT-large checkpoint tuned on SQuAD2.
qa_model = pipeline(
    "question-answering",
    model="deepset/bert-large-uncased-whole-word-masking-squad2",
)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one normalized string.

    Args:
        pdf_file: The PDF to read; passed straight to ``fitz.open``.

    Returns:
        The page texts concatenated in page order, with every run of
        whitespace collapsed to a single space and the ends stripped.
    """
    with fitz.open(pdf_file) as document:
        page_texts = [page.get_text("text") for page in document]
    combined = "".join(page_texts)
    # Extracted text is full of layout line breaks; flatten them to spaces.
    return re.sub(r'\s+', ' ', combined).strip()
def summarize(text, chunk_size=1000):
    """Summarize ``text``, feeding long input to the model in bounded chunks.

    The text is split into pieces of at most ``chunk_size`` characters, each
    piece is summarized independently with the module-level ``summarizer``
    pipeline, and the partial summaries are joined with single spaces.

    Args:
        text: The text to summarize. ``None``, empty, and whitespace-only
            input are all treated as "nothing to summarize".
        chunk_size: Maximum number of characters per chunk sent to the
            model. Defaults to 1000.

    Returns:
        The summary, or an empty string when there is nothing to summarize.

    Raises:
        ValueError: If ``chunk_size`` is not positive (raised by
            ``textwrap.wrap``).
    """
    # textwrap.wrap breaks between words (only words longer than the width
    # are split), so chunks no longer start or end in the middle of a word
    # the way fixed-offset slicing did. It also returns [] for blank input.
    chunks = textwrap.wrap(text or "", width=chunk_size)
    if not chunks:
        # Do not call the model on empty input.
        return ""

    partial_summaries = [
        summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        for chunk in chunks
    ]
    return " ".join(partial_summaries)
def answer_question(text, question):
    """Answer ``question`` using ``text`` as the context.

    Args:
        text: The context passage the answer is extracted from.
        question: The question to answer.

    Returns:
        The ``'answer'`` field of the module-level ``qa_model`` pipeline's
        response, or an empty string when either ``text`` or ``question`` is
        missing or blank.
    """
    # The question-answering pipeline rejects an empty question or context,
    # so a blank field is answered with an empty string instead of letting
    # the call raise.
    if not (text and text.strip()) or not (question and question.strip()):
        return ""

    response = qa_model(question=question, context=text)
    return response['answer']
def summarize_and_qa(pdf_file, question):
    """Summarize an uploaded PDF and answer a question about its content.

    Args:
        pdf_file: The uploaded PDF as supplied by the UI file input, or
            ``None`` when nothing was uploaded.
        question: The user's question; may be empty.

    Returns:
        A ``(summary, answer)`` tuple of strings. When no file was given or
        no text could be extracted, ``summary`` carries a short explanatory
        message and ``answer`` is empty. When the question is blank the
        summary is still produced and ``answer`` is empty.
    """
    if pdf_file is None:
        # Nothing was uploaded; there is nothing to open.
        return "No PDF file was provided.", ""

    text = extract_text_from_pdf(pdf_file)
    if not text:
        # E.g. a PDF with no text layer: neither model has anything to
        # work with, so skip both calls.
        return "No extractable text was found in the PDF.", ""

    summary = summarize(text)
    # A blank question must not cost the user the summary.
    if question and question.strip():
        answer = answer_question(text, question)
    else:
        answer = ""
    return summary, answer
# Build the web UI: a file upload and a text box feed summarize_and_qa, whose
# two returned strings (summary, answer) are shown in two text boxes.
demo = gr.Interface(
    fn=summarize_and_qa,
    inputs=["file", "text"],
    outputs=["textbox", "textbox"],
    title="Understand your PDF Better",
    description="Upload a PDF to get a summary. You can ask any question regarding the content of the PDF.",
)

# Start the app with debug mode on and a shareable link requested.
demo.launch(debug=True, share=True)