Spaces:
Sleeping
Sleeping
Nikhil Singh
committed on
Commit
·
b10c920
1
Parent(s):
d90af9d
added T5
Browse files
- app.py +19 -4
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
|
|
|
|
| 3 |
from mailparser import parse_from_file
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from gliner import GLiNER
|
|
@@ -11,6 +12,9 @@ import os
|
|
| 11 |
import en_core_web_sm
|
| 12 |
nlp = en_core_web_sm.load()
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
_MODEL = {}
|
| 15 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
| 16 |
|
|
@@ -58,6 +62,13 @@ def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
|
|
| 58 |
|
| 59 |
return results
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def present(email_file, labels, multilingual=False):
|
| 62 |
email = accept_mail(email_file)
|
| 63 |
cleaned_text = clean_email(email)
|
|
@@ -67,16 +78,18 @@ def present(email_file, labels, multilingual=False):
|
|
| 67 |
entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
|
| 68 |
|
| 69 |
# Format entities for DataFrame: Convert list of dicts to list of lists
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
|
| 72 |
email_info = {
|
| 73 |
"Subject": email.subject,
|
| 74 |
"From": email.from_,
|
| 75 |
"To": email.to,
|
| 76 |
"Date": email.date,
|
| 77 |
-
"Extracted Entities":
|
| 78 |
}
|
| 79 |
-
return [email_info[key] for key in
|
| 80 |
|
| 81 |
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
|
| 82 |
|
|
@@ -96,8 +109,10 @@ demo = gr.Interface(
|
|
| 96 |
gr.components.Textbox(label="From"),
|
| 97 |
gr.components.Textbox(label="To"),
|
| 98 |
gr.components.Textbox(label="Date"),
|
| 99 |
-
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
|
|
|
|
| 100 |
],
|
|
|
|
| 101 |
title="Email Info Extractor",
|
| 102 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
| 103 |
)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
|
| 3 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
| 4 |
from mailparser import parse_from_file
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
from gliner import GLiNER
|
|
|
|
| 12 |
import en_core_web_sm
|
| 13 |
nlp = en_core_web_sm.load()
|
| 14 |
|
| 15 |
+
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
| 16 |
+
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
| 17 |
+
|
| 18 |
_MODEL = {}
|
| 19 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
| 20 |
|
|
|
|
| 62 |
|
| 63 |
return results
|
| 64 |
|
| 65 |
+
def refine_entities_with_t5(entities):
    """Ask the T5 model to rewrite a list of extracted entities.

    Builds a single prompt of the form
    ``"refine entities: <text> as <label> ; <text> as <label> ; ..."``,
    runs it through the module-level ``t5_tokenizer`` / ``t5_model`` and
    returns the decoded generation.

    Args:
        entities: Iterable of entity records. Each record is either a mapping
            with ``"text"`` and ``"label"`` keys, or a two-item
            ``[text, label]`` sequence (the shape ``present`` produces for
            the DataFrame output before calling this function).

    Returns:
        The decoded model output as a string, or ``""`` when ``entities`` is
        empty.
    """
    pairs = [_entity_text_and_label(entity) for entity in entities]
    if not pairs:
        # Nothing to refine: skip the model call instead of generating text
        # from a prompt that contains no entities.
        return ""

    prompt = "refine entities: " + " ; ".join(
        f"{text} as {label}" for text, label in pairs
    )
    input_ids = t5_tokenizer.encode(
        prompt, return_tensors="pt", add_special_tokens=True
    )
    # NOTE(review): generate() runs with the library's default generation
    # settings, so the output length is whatever those defaults allow --
    # confirm that is long enough for large entity lists.
    outputs = t5_model.generate(input_ids)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)


def _entity_text_and_label(entity):
    """Return the ``(text, label)`` pair for one entity record.

    Accepts either a mapping with ``"text"``/``"label"`` keys or a two-item
    sequence; a sequence of any other length raises ``ValueError`` from the
    unpacking below.
    """
    if isinstance(entity, dict):
        return entity["text"], entity["label"]
    text, label = entity
    return text, label
|
| 71 |
+
|
| 72 |
def present(email_file, labels, multilingual=False):
|
| 73 |
email = accept_mail(email_file)
|
| 74 |
cleaned_text = clean_email(email)
|
|
|
|
| 78 |
entities = parse_query(sentence_list, labels, threshold=0.3, nested_ner=False, model_name="urchade/gliner_base", multilingual=multilingual)
|
| 79 |
|
| 80 |
# Format entities for DataFrame: Convert list of dicts to list of lists
|
| 81 |
+
entities = [[entity['text'], entity['label']] for entity in entities]
|
| 82 |
+
|
| 83 |
+
refined_entities = refine_entities_with_t5(entities)
|
| 84 |
|
| 85 |
email_info = {
|
| 86 |
"Subject": email.subject,
|
| 87 |
"From": email.from_,
|
| 88 |
"To": email.to,
|
| 89 |
"Date": email.date,
|
| 90 |
+
"Extracted Entities": refined_entities
|
| 91 |
}
|
| 92 |
+
return [email_info[key] for key in email_info]
|
| 93 |
|
| 94 |
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
|
| 95 |
|
|
|
|
| 109 |
gr.components.Textbox(label="From"),
|
| 110 |
gr.components.Textbox(label="To"),
|
| 111 |
gr.components.Textbox(label="Date"),
|
| 112 |
+
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
|
| 113 |
+
gr.components.Textbox(label="Refined Entities")
|
| 114 |
],
|
| 115 |
+
layout="horizontal",
|
| 116 |
title="Email Info Extractor",
|
| 117 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
| 118 |
)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
gliner
|
|
|
|
| 2 |
mail-parser
|
| 3 |
gradio
|
| 4 |
beautifulsoup4
|
|
|
|
| 1 |
gliner
|
| 2 |
+
transformers
|
| 3 |
mail-parser
|
| 4 |
gradio
|
| 5 |
beautifulsoup4
|