"""Streamlit app: upload a PDF and answer questions about it with a seq2seq model."""

import streamlit as st
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Cache the heavy model/tokenizer load across Streamlit reruns.
@st.cache_resource
def load_model():
    """Load and return the (tokenizer, model) pair for the QA model.

    Returns:
        tuple: (AutoTokenizer, AutoModelForSeq2SeqLM) loaded from the hub.
    """
    # NOTE(review): "OFA-Sys/mineru-base" — confirm this model id exists on the hub
    # and is a seq2seq QA model; the repo name looks unusual.
    tokenizer = AutoTokenizer.from_pretrained("OFA-Sys/mineru-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("OFA-Sys/mineru-base")
    return tokenizer, model


tokenizer, model = load_model()

# --- UI ---
st.title("📄 MinerU: Ask Questions from PDF")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Enter your question:")

if uploaded_file and question:
    with pdfplumber.open(uploaded_file) as pdf:
        # extract_text() returns None for pages with no extractable text
        # (e.g. scanned images); coalesce to "" so joining never raises TypeError.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    if not text.strip():
        # Nothing to answer from — tell the user instead of prompting on empty context.
        st.warning("No extractable text was found in this PDF.")
    else:
        # Prepare input for MinerU (usually expects a prompt);
        # slice the context to stay within the model's token limit.
        input_text = f"question: {question} context: {text[:3000]}"  # MinerU has token limit
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=128)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.markdown(f"### Answer:\n{answer}")