Spaces:
Runtime error
Runtime error
| import torch | |
| from transformers import (AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, | |
| AutoConfig, AutoModelForTokenClassification, | |
| AutoTokenizer, pipeline) | |
| from peft import PeftModel, PeftConfig | |
| import streamlit as st | |
def load_sentiment_analyzer():
    """Load the sentiment-analysis tokenizer and classification model.

    Both objects are fetched from the same checkpoint,
    ``aliciiavs/sentiment-analysis-whatsapp2``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "aliciiavs/sentiment-analysis-whatsapp2"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )
def load_summarizer():
    """Load the BART summarizer with its SAMSum adapter merged in.

    The base ``facebook/bart-large`` model is wrapped with the PEFT adapter
    ``marcelomoreno26/bart-large-samsum-adapter`` and then merged via
    ``merge_and_unload()`` so a plain model object is returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    adapter_id = "marcelomoreno26/bart-large-samsum-adapter"
    base_id = "facebook/bart-large"

    adapter_config = PeftConfig.from_pretrained(adapter_id)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_id)

    tokenizer = AutoTokenizer.from_pretrained(base_id)
    # The tokenizer's padding token is overridden with its end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token

    adapted_model = PeftModel.from_pretrained(base_model, adapter_id, config=adapter_config)
    merged_model = adapted_model.merge_and_unload()
    return tokenizer, merged_model
def load_NER():
    """Build the named-entity-recognition pipeline.

    Config, model and tokenizer all come from
    ``hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann``.

    Returns:
        A ``transformers`` ``"ner"`` pipeline created with
        ``aggregation_strategy="average"``.
    """
    checkpoint = "hannahisrael03/wikineural-multilingual-ner-finetuned-wikiann"

    ner_config = AutoConfig.from_pretrained(checkpoint)
    ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint, config=ner_config)
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return pipeline(
        "ner",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="average",
    )
def get_sentiment_analysis(text, tokenizer, model):
    """Classify the emotion expressed in a single piece of text.

    Args:
        text: The message to classify. Must produce a batch of exactly one
            sequence, because the predicted index is extracted with
            ``.item()``.
        tokenizer: Callable tokenizer; invoked with ``padding=True``,
            ``truncation=True`` and ``return_tensors="pt"``.
        model: Classification model called as ``model(**inputs)`` whose
            output exposes a ``logits`` tensor of shape (batch, num_labels).

    Returns:
        One of ``'sadness'``, ``'joy'``, ``'love'``, ``'anger'``, ``'fear'``
        or ``'surprise'``.

    Raises:
        KeyError: If the model predicts an index outside 0-5.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length; without it, an over-long message is passed to the model as-is
    # and can fail inside the forward pass.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the argmax of the raw logits is the same class
    # as the argmax of the probabilities.
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Map predicted label index to sentiment label.
    label_dic = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
    return label_dic[predicted_label]
def generate_summary(text, tokenizer, model):
    """Summarize ``text``, splitting long inputs into overlapping windows.

    The text is prefixed with ``"summarize: "`` and tokenized. Inputs of at
    most 512 tokens are summarized in one ``model.generate`` call. Longer
    inputs are cut into windows of up to 512 tokens whose start positions
    advance by 462 tokens (so consecutive windows share 50 tokens); each
    window is summarized independently and the partial summaries are joined
    with single spaces.

    Args:
        text: The text to summarize.
        tokenizer: Tokenizer providing ``encode_plus`` and ``decode``.
        model: Model providing ``generate``.

    Returns:
        The summary string.
    """
    prefix = "summarize: "
    max_length = 512  # maximum number of input tokens per generate() call
    overlap = 50      # tokens shared between consecutive windows
    generation_kwargs = {"max_length": 150, "num_beams": 5, "early_stopping": True}

    encoded_input = tokenizer.encode_plus(prefix + text, return_tensors='pt', add_special_tokens=True)
    input_ids = encoded_input['input_ids']
    total_tokens = input_ids.shape[1]

    # Short input: process as usual in a single pass.
    if total_tokens <= max_length:
        output_ids = model.generate(input_ids, **generation_kwargs)
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Long input: summarize each overlapping window separately.
    segment_summaries = []
    for start in range(0, total_tokens, max_length - overlap):
        segment_ids = input_ids[:, start:start + max_length]
        output_ids = model.generate(segment_ids, **generation_kwargs)
        segment_summaries.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
        # Once a window reaches the end of the input, any further window
        # would lie entirely inside it and only repeat the tail's summary.
        if start + max_length >= total_tokens:
            break

    return ' '.join(segment_summaries)
def get_NER(text, pipe):
    """Extract named entities from ``text`` as ``[word, entity_group]`` rows.

    Args:
        text: The text to analyse.
        pipe: Callable returning an iterable of dicts with the keys
            ``'entity_group'``, ``'word'``, ``'score'`` and ``'start'``.

    Returns:
        A list of ``[word, entity_group]`` lists, ordered by each entity's
        ``'start'`` value. Duplicate (group, word) pairs are collapsed to the
        highest-scoring occurrence, and ``'ORG'`` entities whose word has more
        than two whitespace-separated parts are dropped.
    """
    # Keep only the best-scoring hit per (entity type, word); on a tie the
    # earlier hit wins because the comparison is strict.
    best_by_key = {}
    for entity in pipe(text):
        key = (entity['entity_group'], entity['word'])
        current = best_by_key.get(key)
        if current is None or current['score'] < entity['score']:
            best_by_key[key] = entity

    # Restore the order in which the surviving entities appear in the text.
    ordered = sorted(best_by_key.values(), key=lambda e: e['start'])

    # Non-ORG entities always pass; ORG entities pass only with <= 2 words.
    return [
        [e['word'], e['entity_group']]
        for e in ordered
        if e['entity_group'] != 'ORG' or len(e['word'].split()) <= 2
    ]