# (Removed Hugging Face Spaces page-scrape residue: "Spaces: Sleeping")
# Email entity-extraction demo: parse an email, clean its HTML body,
# split into sentences with spaCy, and run GLiNER NER over each sentence.

# Standard library
import os
import re
from typing import Dict, List, Union

# Third-party
import gradio as gr
from bs4 import BeautifulSoup
from gliner import GLiNER
from mailparser import parse_from_string
import spacy
import en_core_web_sm

# spaCy English pipeline, used only for sentence segmentation.
nlp = en_core_web_sm.load()
# nlp = spacy.load("en_core_web_sm")  # alternative loading path

# Lazy cache of loaded GLiNER models, keyed by model name (see get_model).
_MODEL = {}
# Optional cache directory for downloaded model weights; None -> library default.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(email_content):
    """Parse a raw email string into a mailparser email object.

    Args:
        email_content: The full raw email source (headers + body) as a string.

    Returns:
        The parsed ``mailparser`` email object (exposes ``.subject``,
        ``.from_``, ``.to``, ``.date``, ``.body``, ...).
    """
    return parse_from_string(email_content)
def clean_email(email):
    """Extract plain text from an email body, stripping HTML styling.

    Removes ``<style>`` and ``<link>`` tags entirely (their content is CSS,
    not message text), then collapses all whitespace to single spaces.

    Args:
        email: A parsed mailparser email object with a ``.body`` attribute.

    Returns:
        The body text as a single whitespace-normalized string.
    """
    soup = BeautifulSoup(email.body, 'html.parser')
    for tag in soup.find_all(['style', 'link']):
        tag.decompose()
    # get_text(separator=' ') keeps words from adjacent tags apart;
    # split/join collapses runs of whitespace (including newlines).
    cleaned_text = ' '.join(soup.get_text(separator=' ').split())
    return cleaned_text
# Runs of '=', '_' and '-' are common artifacts of quoted-printable encoding
# and ASCII dividers in emails; strip them out entirely.
_SPECIAL_RUN = re.compile(r'[=_-]+')


def remove_special_characters(text):
    """Remove runs of '=', '_' and '-' characters from *text*.

    Args:
        text: Input string.

    Returns:
        *text* with every maximal run of the characters ``= _ -`` deleted.
    """
    return _SPECIAL_RUN.sub('', text)
def get_sentences(further_cleaned_text):
    """Split text into sentences using the module-level spaCy pipeline.

    Args:
        further_cleaned_text: Plain text (already HTML-stripped and
            whitespace-normalized).

    Returns:
        A list of sentence strings in document order.
    """
    doc = nlp(further_cleaned_text)
    return [sent.text for sent in doc.sents]
def get_model(model_name: str = None):
    """Return a GLiNER model, loading and caching it on first use.

    Models are stored in the module-level ``_MODEL`` dict so repeated
    requests reuse the already-loaded instance.

    Args:
        model_name: Hugging Face model id; defaults to ``urchade/gliner_base``
            when None.

    Returns:
        The loaded ``GLiNER`` model instance.
    """
    if model_name is None:
        model_name = "urchade/gliner_base"
    global _MODEL
    if _MODEL.get(model_name) is None:
        # Downloaded weights go to _CACHE_DIR when set (CACHE_DIR env var).
        _MODEL[model_name] = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
    return _MODEL[model_name]
def parse_query(sentences: List[str], labels: Union[str, list], threshold: float = 0.3, nested_ner: bool = False, model_name: str = None) -> List[Dict[str, Union[str, list]]]:
    """Run GLiNER entity extraction over a list of sentences.

    Args:
        sentences: Sentences to process, one prediction call each.
        labels: Entity labels to detect — either a list of label strings or
            a single comma-separated string.
        threshold: Minimum confidence for a predicted entity.
        nested_ner: Accepted for interface compatibility but currently
            unused — predictions are not nested.
        model_name: GLiNER model id; None selects the default model
            (see ``get_model``).

    Returns:
        One dict per sentence: ``{"sentence": <str>, "entities": <list>}``.
    """
    model = get_model(model_name)
    if isinstance(labels, str):
        labels = [i.strip() for i in labels.split(",")]
    results = []
    for sentence in sentences:
        predicted = model.predict_entities(sentence, labels, threshold=threshold)
        # list() copies the predictions; the original element-by-element
        # append loop did exactly this.
        results.append({"sentence": sentence, "entities": list(predicted)})
    return results
def present(email_content, labels):
    """End-to-end pipeline for the Gradio UI.

    Parses the raw email, cleans its body, segments it into sentences, and
    extracts entities with GLiNER.

    Args:
        email_content: Raw email source as a string.
        labels: Entity labels to detect (list or comma-separated string).

    Returns:
        A list of six values, ordered to match the Gradio output components:
        subject, from, to, date, cleaned body, extracted entities.
    """
    email = accept_mail(email_content)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    result = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base")
    email_info = {
        "Subject": email.subject,
        "From": email.from_,
        "To": email.to,
        "Date": email.date,
        "Cleaned Body": further_cleaned_text,
        "Extracted Entities": result,
    }
    # Order matches the outputs list of the Gradio Interface below.
    return [email_info[key] for key in email_info]
# Default entity labels offered in the UI checkbox group.
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER",
          "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY",
          "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]

demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.Textbox(label="Email Content"),
        # NOTE(review): `default=` is the Gradio 3.x kwarg; Gradio 4+ renamed
        # it to `value=` — confirm against the pinned gradio version.
        gr.components.CheckboxGroup(label="Labels to Detect", choices=labels, default=labels)
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Textbox(label="Cleaned Body"),
        gr.components.Textbox(label="Extracted Entities")
    ],
    title="Email Info",
    description="Enter the email content below to view its details and detected entities."
)

demo.launch()