| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Load model directly\n", | |
| "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n", | |
| "import torch\n", | |
| "import gradio as gr\n", | |
| "from openpyxl import load_workbook\n", | |
| "from numpy import mean\n", | |
| "\n", | |
| "tokenizer = AutoTokenizer.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n", | |
| "model = AutoModelForSeq2SeqLM.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n", | |
| "\n", | |
| "tokenizer_keywords = AutoTokenizer.from_pretrained(\"transformer3/H2-keywordextractor\")\n", | |
| "model_keywords = AutoModelForSeq2SeqLM.from_pretrained(\"transformer3/H2-keywordextractor\")\n", | |
| "\n", | |
| "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", | |
| "# Load the fine-tuned model and tokenizer\n", | |
| "new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n", | |
| "new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n", | |
| "\n", | |
| "\n", | |
| "# Create a classification pipeline\n", | |
| "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n", | |
| "\n", | |
| "# Add label mapping for sentiment analysis\n", | |
| "label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n", | |
| "\n", | |
| "def parse_xl(file_path):\n", | |
| " cells = []\n", | |
| "\n", | |
| " workbook = load_workbook(filename=file_path)\n", | |
| " for sheet in workbook.worksheets:\n", | |
| " for row in sheet.iter_rows():\n", | |
| " for cell in row:\n", | |
| " if cell.value != None:\n", | |
| " cells.append(cell.value)\n", | |
| "\n", | |
| " return cells\n", | |
| "\n", | |
| "def evaluate(file):\n", | |
| " reviews = parse_xl(file)\n", | |
| " ratings = []\n", | |
| " text = \"\"\n", | |
| "\n", | |
| " for review in reviews:\n", | |
| " ratings.append(int(classifier(review)[0]['label'].split('_')[1]))\n", | |
| " text += review\n", | |
| " text += \" \"\n", | |
| " \n", | |
| " inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n", | |
| " summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=50, max_length=1000)\n", | |
| " summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n", | |
| "\n", | |
| " inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n", | |
| " summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n", | |
| " keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] \n", | |
| "\n", | |
| " return round(mean(ratings), 2), summary, keywords\n", | |
| "\n", | |
| "iface = gr.Interface(\n", | |
| " fn=evaluate,\n", | |
| " inputs=gr.File(label=\"Reviews\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"]),\n", | |
| " outputs=[gr.Textbox(label=\"Rating\"), gr.Textbox(label=\"Summary\"), gr.Textbox(label=\"Keywords\")],\n", | |
| " title='Summarize Reviews',\n", | |
| " description=\"Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each reviews is in its own cell.\"\n", | |
| ")\n", | |
| "\n", | |
| "iface.launch(share=True)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "SolutionsInPR", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } | |