Spaces:
Sleeping
Sleeping
Nikhil Singh
committed on
Commit
·
1afbc3a
1
Parent(s):
9fe2871
t5 add
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ from mailparser import parse_from_file
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from gliner import GLiNER
|
| 6 |
from typing import Dict, Union, List
|
|
|
|
| 7 |
|
| 8 |
import spacy
|
| 9 |
import re
|
|
@@ -11,6 +12,9 @@ import os
|
|
| 11 |
import en_core_web_sm
|
| 12 |
nlp = en_core_web_sm.load()
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
_MODEL = {}
|
| 15 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
| 16 |
|
|
@@ -58,6 +62,13 @@ def parse_query(sentences: List[str], labels: List[str], threshold: float = 0.3,
|
|
| 58 |
|
| 59 |
return results
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def present(email_file, labels, multilingual=False):
|
| 62 |
email = accept_mail(email_file)
|
| 63 |
cleaned_text = clean_email(email)
|
|
@@ -96,7 +107,8 @@ demo = gr.Interface(
|
|
| 96 |
gr.components.Textbox(label="From"),
|
| 97 |
gr.components.Textbox(label="To"),
|
| 98 |
gr.components.Textbox(label="Date"),
|
| 99 |
-
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities")
|
|
|
|
| 100 |
],
|
| 101 |
title="Email Info Extractor",
|
| 102 |
description="Upload an email file (.eml) to extract its details and detected entities."
|
|
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from gliner import GLiNER
|
| 6 |
from typing import Dict, Union, List
|
| 7 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
| 8 |
|
| 9 |
import spacy
|
| 10 |
import re
|
|
|
|
| 12 |
import en_core_web_sm
|
| 13 |
nlp = en_core_web_sm.load()
|
| 14 |
|
| 15 |
+
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
| 16 |
+
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
| 17 |
+
|
| 18 |
_MODEL = {}
|
| 19 |
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
|
| 20 |
|
|
|
|
| 62 |
|
| 63 |
return results
|
| 64 |
|
| 65 |
+
def refine_entities_with_t5(entities, max_new_tokens=64):
    """Run the extracted entities through the module-level T5 model.

    Each entity is rendered as ``"<text> as <label>"``; the pieces are joined
    with ``" ; "`` and prefixed with ``"refine entities: "`` to form the
    prompt that is passed to ``t5_model.generate``.

    Args:
        entities: Iterable of mappings, each providing ``"text"`` and
            ``"label"`` keys. ``None`` or an empty collection is accepted.
        max_new_tokens: Upper bound on the number of generated tokens.
            Without an explicit bound, ``generate`` falls back to the
            library's short default output length, which cuts off the
            result for all but the smallest entity lists.

    Returns:
        The decoded model output with special tokens stripped, or an empty
        string when there are no entities to refine.
    """
    # Nothing to refine: skip the model call instead of generating text
    # from a bare "refine entities: " prompt.
    if not entities:
        return ""

    # NOTE(review): "refine entities:" does not look like one of the task
    # prefixes the stock T5 checkpoints were trained with - confirm the
    # output is meaningful for this use case.
    prompt = "refine entities: " + " ; ".join(
        f"{entity['text']} as {entity['label']}" for entity in entities
    )
    input_ids = t5_tokenizer.encode(
        prompt, return_tensors="pt", add_special_tokens=True
    )
    outputs = t5_model.generate(input_ids, max_new_tokens=max_new_tokens)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 71 |
+
|
| 72 |
def present(email_file, labels, multilingual=False):
|
| 73 |
email = accept_mail(email_file)
|
| 74 |
cleaned_text = clean_email(email)
|
|
|
|
| 107 |
gr.components.Textbox(label="From"),
|
| 108 |
gr.components.Textbox(label="To"),
|
| 109 |
gr.components.Textbox(label="Date"),
|
| 110 |
+
gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
|
| 111 |
+
gr.components.Textbox(label="Refined Entities")
|
| 112 |
],
|
| 113 |
title="Email Info Extractor",
|
| 114 |
description="Upload an email file (.eml) to extract its details and detected entities."
|