import gradio as gr
import languagecodes
import inspect
import httpx, os
import polars as pl
# Language lookup table shipped alongside the app.
df = pl.read_parquet("isolanguages.parquet")
# Skip the first row (presumably a header/placeholder — TODO confirm) and keep
# only entries that actually have an ISO 639-1 two-letter code.
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
# all_langs = languagecodes.iso_languages_byname
# Language name -> its three ISO codes; used to map fastText labels back to ISO 639-1.
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
# ISO 639-1 code -> (language name, other ISO codes); used for the summary row in detect_language().
iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
DEFAULTS = None  # placeholder; not read anywhere in this file
# Library names offered in the UI; must match the dispatch in detect_language().
libraries = ["langdetect", "py3langid", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]
class Detect():
def __init__(self, text: str) -> None:
self.text: str = text
def langdetect(self) -> list[str, float]:
from langdetect import detect, detect_langs
from langdetect import DetectorFactory
DetectorFactory.seed = 0
langcode = detect(self.text)
langecode_probabilities: list[Language] = detect_langs(self.text)
return [langcode, round(number=langecode_probabilities[0].prob * 100, ndigits=2)]
def langid(self) -> list[str, float]:
from langid.langid import LanguageIdentifier, model
identifier = LanguageIdentifier.from_modelstring(string=model, norm_probs=True)
idresult: list[str, float] = list(identifier.classify(self.text))
return [idresult[0], abs(round(number=idresult[1] * 100, ndigits=2))]
def py3langid(self) -> list[str, float]:
langs = ["af", "am", "an", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy",
"da", "de", "dz", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "ga",
"gl", "gu", "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jv", "ka",
"kk", "km", "kn", "ko", "ku", "ky", "la", "lb", "lo", "lt", "lv", "mg", "mk", "ml",
"mn", "mr", "ms", "mt", "nb", "ne", "nl", "nn", "no", "oc", "or", "pa", "pl", "ps",
"pt", "qu", "ro", "ru", "rw", "se", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta",
"te", "th", "tl", "tr", "ug", "uk", "ur", "vi", "vo", "wa", "xh", "zh", "zu"]
import py3langid
lang, prob = py3langid.classify(self.text) # unpack the result tuple in variables
return [lang, abs(round(number=prob, ndigits=2))]
def lingua(self) -> list[str, float]:
from lingua import Language, LanguageDetectorBuilder
detector: LanguageDetector = LanguageDetectorBuilder.from_all_languages().with_preloaded_language_models().build()
confidence_values: List[ConfidenceValue] = detector.compute_language_confidence_values(self.text)
return [confidence_values[0].language.iso_code_639_1.name.lower(), "{0:.2f}".format(confidence_values[0].value * 100)]
def fasttextlangdetect(self) -> list[str, float]: # requires numpy < 2.0
from ftlangdetect import detect
result = detect(text=self.text, low_memory=False)
return [result.get('lang'), abs(round(number=result.get('score') * 100, ndigits=2))]
def fastlangdetect(self) -> list[str, float]:
from fast_langdetect import detect
result = detect(text=self.text, model="auto", k=1)[0]
return [result.get('lang'), abs(round(number=result.get('score') * 100, ndigits=2))]
def pycld2(self) -> list[str, float]:
import pycld2 as cld2
# available_languages = cld2.LANGUAGES
isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
return [details[0][1], round(details[0][2], 2)]
def parse_fastext(self, repo_id, k=3):
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
model = fasttext.load_model(model_path)
language, probabilities = model.predict(self.text, k=k)
reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
lang_code = all_langs[long_langname][0]
return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
def fasttext(self) -> list[str, float]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)
language, probabilities = model.predict(self.text, k=3)
reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
lang_code = all_langs[long_langname][0]
return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
def openlid(self) -> list[str, float]:
import fasttext
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id="laurievb/OpenLID-v2", filename="model.bin")
model = fasttext.load_model(model_path)
language, probabilities = model.predict(self.text, k=3)
reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
lang_code = all_langs[long_langname][0]
return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
def glotlid(self) -> list[str, float]:
repo_id="cis-lmu/glotlid"
return self.parse_fastext(repo_id)
def detect_language(input_text: str, used_libraries: list[str]) -> list[list]:
    """
    Detects the language of the input text.

    Parameters:
        input_text (str): The source text whose language is detected
        used_libraries (list[str]): The libraries to be used for detection

    Returns:
        list of lists, one row per selected library:
            [library_name, detected_code, confidence]
        followed by a summary row listing the unique languages found.

    Example:
        >>> detect_language("Hello world", ["langdetect", "langid", "lingua-py", "fasttextlangdetect", "fastlangdetect"])
        [["en", 1.0]]
    """
    # Guard first: don't build a detector (or touch any models) for empty requests.
    if not input_text or not used_libraries:
        return [['No input text or library selected', 'Please provide input text and/or select a detection library']]
    detectinstance = Detect(input_text)
    # UI label -> Detect method name; insertion order fixes the row order
    # (identical to the original if/elif chain). Only 'lingua-py' differs
    # from its method name.
    dispatch = {
        'langdetect': 'langdetect',
        'langid': 'langid',
        'py3langid': 'py3langid',
        'lingua-py': 'lingua',
        'pycld2': 'pycld2',
        'fastlangdetect': 'fastlangdetect',
        'fasttext': 'fasttext',
        'openlid': 'openlid',
        'glotlid': 'glotlid',
    }
    detections = []
    for library, method_name in dispatch.items():
        if library in used_libraries:
            detections.append([library] + getattr(detectinstance, method_name)())
    # Deduplicate codes; sorted so the summary row is deterministic across runs.
    unique_codes = sorted({row[1] for row in detections})
    # .get fallback: some libraries can emit codes absent from the ISO table
    # (e.g. pycld2's 'un' for unknown) — show the raw code instead of crashing.
    unique_languages = [iso1toall.get(code, (code,))[0] for code in unique_codes]
    detections.append([f'Unique languages: {unique_languages}', f'Unique codes: {unique_codes}', f'Languages detected: {len(unique_codes)}'])
    print(unique_codes, unique_languages, detections)
    return detections
# Gradio UI: one text box in, one results table out.  Also served as a
# Gradio API endpoint and an MCP tool (see launch() below).
with gr.Blocks() as interface:
    gr.Markdown("### Language Detection with Gradio API and MCP Server")
    # Submitting (Enter or the "Detect Language" button) triggers detection.
    input_text = gr.Textbox(label="Enter text to detect:", placeholder="Type/copy text here, maximum 512 characters",
                            autofocus=True, submit_btn='Detect Language', max_length=512)
    with gr.Row(variant="compact"):
        # All libraries pre-selected by default (value=libraries).
        used_libraries = gr.CheckboxGroup(choices=libraries, value=libraries, label="Detection libraries", show_select_all=True)
    # One row per selected library, plus the summary row appended by detect_language().
    dataframe = gr.Dataframe(
        headers=["Library", "Language code", "Score"],
        datatype=["str", "str", "number"],
        type='array',  # detect_language returns a plain list of lists
        row_count=len(libraries),
        column_count=3,
        # NOTE(review): column_limits is not a documented gr.Dataframe argument —
        # verify against the installed Gradio version.
        column_limits=(2, 4),
        label='Language detection dataframe'
    )
    input_text.submit(
        fn=detect_language,
        inputs=[input_text, used_libraries],
        outputs=[dataframe]
    )
if __name__ == "__main__":
    # mcp_server=True additionally exposes detect_language as an MCP tool.
    interface.launch(mcp_server=True, footer_links=["api", "settings"])
    # interface.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860, mcp_server=True)