# Spaces: Sleeping (Hugging Face Spaces status banner captured during scrape)
import streamlit as st
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


@st.cache_resource
def load_model():
    """Load the seq2seq QA tokenizer and model, cached across reruns.

    Streamlit re-executes the entire script on every widget interaction;
    without caching, the model would be re-instantiated (and possibly
    re-downloaded) on each keystroke. ``st.cache_resource`` keeps one
    shared copy for the lifetime of the server process.

    Returns:
        tuple: ``(tokenizer, model)`` ready for inference.
    """
    # NOTE(review): "OFA-Sys/mineru-base" — verify this checkpoint id exists
    # on the Hub; a typo here only surfaces at runtime as a download error.
    tokenizer = AutoTokenizer.from_pretrained("OFA-Sys/mineru-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("OFA-Sys/mineru-base")
    return tokenizer, model
tokenizer, model = load_model()

# --- UI ---
st.title("📄 MinerU: Ask Questions from PDF")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
question = st.text_input("Enter your question:")

if uploaded_file and question:
    # Extract text from every page. pdfplumber's extract_text() returns
    # None for pages with no extractable text (e.g. scanned images), so
    # coerce each page to "" — the original `text += extract_text()`
    # raised TypeError on such pages. join() also avoids quadratic
    # string concatenation and keeps page boundaries from fusing words.
    with pdfplumber.open(uploaded_file) as pdf:
        text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    if not text.strip():
        # Nothing to answer from — tell the user instead of querying
        # the model with an empty context.
        st.warning("No extractable text found in this PDF.")
    else:
        # Prepare input for MinerU (usually expects a prompt)
        input_text = f"question: {question} context: {text[:3000]}"  # MinerU has token limit
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=128)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        st.markdown(f"### Answer:\n{answer}")