"""Gradio app: upload PDF files and compare an original vs. quantized model.

Each uploaded PDF's text is classified by two variants of DistilBERT —
the original float32 model and a dynamically int8-quantized copy — and the
predictions plus per-model inference times are reported in a table and a
downloadable CSV.
"""

import time

import fitz  # PyMuPDF, for PDF text extraction
import gradio as gr
import pandas as pd
import torch
from torch.quantization import quantize_dynamic
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load tokenizer and both model variants once at startup.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
original_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
original_model.eval()  # inference only — disable dropout, etc.
# Dynamic int8 quantization of the Linear layers: smaller and faster on CPU.
quantized_model = quantize_dynamic(original_model, {torch.nn.Linear}, dtype=torch.qint8)


def extract_text_from_pdf(pdf_file):
    """Return the stripped text of every page of a Gradio-uploaded PDF."""
    text = ""
    # `.name` is the temp-file path Gradio hands us; `with` ensures the
    # document handle is closed even if extraction fails mid-way.
    with fitz.open(pdf_file.name) as doc:
        for page in doc:
            text += page.get_text()
    return text.strip()


def classify_text(text, model):
    """Classify `text` with `model` and return the argmax class index (int)."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # no gradients needed for inference
        outputs = model(**inputs)
    return torch.argmax(outputs.logits, dim=1).item()


def analyze_pdfs(pdf_files):
    """Run both models over each uploaded PDF and time them.

    Args:
        pdf_files: list of Gradio file objects (may be None/empty).

    Returns:
        (status_message, results_dataframe_or_None, csv_path_or_None)
    """
    if not pdf_files:
        return "⚠️ Please upload at least one PDF file.", None, None

    results = []
    skipped_files = []

    for pdf in pdf_files:
        filename = pdf.name
        text = extract_text_from_pdf(pdf)
        if not text:  # already stripped by extract_text_from_pdf
            skipped_files.append(filename)
            continue

        start1 = time.time()
        pred_orig = classify_text(text, original_model)
        time1 = round(time.time() - start1, 3)

        start2 = time.time()
        pred_quant = classify_text(text, quantized_model)
        time2 = round(time.time() - start2, 3)

        results.append({
            "File": filename,
            "Original_Model_Prediction": pred_orig,
            "Original_Time(s)": time1,
            "Quantized_Model_Prediction": pred_quant,
            "Quantized_Time(s)": time2,
        })

    if not results:
        return "⚠️ No valid text found in any PDF.", None, None

    df = pd.DataFrame(results)
    csv_path = "model_comparison.csv"
    df.to_csv(csv_path, index=False)

    message = f"✅ Processed {len(results)} file(s)."
    if skipped_files:
        message += f" Skipped (no text): {', '.join(skipped_files)}"
    return message, df, csv_path


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 📊 Compare Original vs Quantized DistilBERT Model on PDFs")
    pdf_input = gr.File(label="Upload PDF(s)", file_types=[".pdf"], file_count="multiple")
    run_button = gr.Button("Run Analysis")
    status = gr.Textbox(label="Status", interactive=False)
    output_table = gr.Dataframe(label="Results")
    download_link = gr.File(label="Download CSV")

    run_button.click(
        fn=analyze_pdfs,
        inputs=[pdf_input],
        outputs=[status, output_table, download_link],  # status + table + CSV download
    )

if __name__ == "__main__":
    demo.launch()