"""AI document summarizer: Gradio UI driving Phi-2 via a LangChain MapReduce chain."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFacePipeline
import fitz  # PyMuPDF for PDF — NOTE(review): unused here, kept for compatibility
from docx import Document  # NOTE(review): unused here, kept for compatibility

# Load Phi-2 model and tokenizer once at module import.
device = "cpu"
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BUG FIX: float16 is poorly supported on CPU (many kernels unimplemented or
# very slow); use float32 when running on CPU, float16 only on an accelerator.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
)
model.to(device)

# BUG FIX: HuggingFacePipeline requires a transformers `pipeline` object, not a
# bare model — passing `model` directly fails at generation time. Build the
# text-generation pipeline once and reuse it for every request.
hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,  # cap generation so map/reduce steps terminate promptly
    device=-1,  # -1 selects CPU for transformers pipelines
)


def load_document(file_path):
    """Load a document and extract its text using the appropriate loader.

    Args:
        file_path: Path to a .pdf, .docx, or .txt file.

    Returns:
        A list of LangChain Document objects, or None when the file
        extension is not supported.
    """
    if file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif file_path.endswith(".docx"):
        loader = Docx2txtLoader(file_path)
    elif file_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        # BUG FIX: returning the error string here was truthy, so the caller's
        # `if not docs` guard let it flow into split_documents() and crash.
        # Signal "unsupported" with None; the caller produces the user message.
        return None
    return loader.load()


def summarize_document(file):
    """Summarize an uploaded file with Phi-2 using a MapReduce chain.

    Args:
        file: Gradio file object; its `.name` attribute holds the temp path.

    Returns:
        The generated summary string, or an error message for
        unsupported or empty input.
    """
    docs = load_document(file.name)
    if docs is None:
        return "Unsupported file format."
    if not docs:
        return "No text found in document."

    # Split text into overlapping chunks so each fits the model context.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_documents(docs)

    # MapReduce: summarize each chunk, then summarize the partial summaries.
    llm = HuggingFacePipeline(pipeline=hf_pipeline)
    summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")
    return summarize_chain.run(split_docs)


# Gradio Interface
demo = gr.Interface(
    fn=summarize_document,
    inputs=gr.File(label="Upload Document (PDF, DOCX, TXT)"),
    outputs=gr.Textbox(label="Summarized Text"),
    title="AI Document Summarizer with Phi-2",
    description="Upload a document, and the AI will generate a summary using MapReduce.",
)

if __name__ == "__main__":
    demo.launch()