File size: 8,881 Bytes
56f497c
cae0132
4c84f29
4b799e0
7748528
56f497c
7748528
4045d37
d99f102
b4c4fbe
86562c6
03b31f1
0f855be
c9af036
e13fba3
f8289a7
 
 
338a6c2
f8289a7
 
 
 
 
6f0b2fb
338a6c2
2d2ddee
 
 
 
c9af036
 
 
 
 
 
 
 
 
e453008
c9af036
338a6c2
2d2ddee
 
 
 
0db40e8
6f0b2fb
 
338a6c2
cb7c22e
338a6c2
cb7c22e
338a6c2
ea07cc3
 
 
 
 
230b561
 
 
 
 
 
 
 
 
 
56052d6
 
 
 
 
3737317
a99d331
9418f40
a99d331
 
a16f80e
 
 
 
 
3737317
a16f80e
 
 
 
230b561
 
f935dba
56052d6
f164dbf
135432f
8880555
135432f
481d87b
135432f
cb7c22e
135432f
014752e
8880555
 
481d87b
 
cb7c22e
 
135432f
f8289a7
426e257
6f0b2fb
 
426e257
0f47030
426e257
0f47030
c9af036
 
2d2ddee
0f47030
82b0a26
 
cb7c22e
29266fd
56052d6
 
a16f80e
 
230b561
 
649882f
28e136e
649882f
 
426e257
481d87b
1ac6eb0
0a073cb
8880555
0f855be
8880555
39e5d80
ac3a794
0f47030
 
d067fbe
d88a3c1
0f47030
6f0b2fb
8880555
ac3a794
1ac6eb0
0f855be
f164dbf
ac3a794
1ac6eb0
8880555
28cef2e
ccec2fc
16105d3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
import languagecodes
import inspect
import httpx, os
import polars as pl 

# Load the ISO-639 language table shipped with the app. Each row holds
# (language name, ISO639-1, ISO639-2/B, ISO639-2/T) codes.
df = pl.read_parquet("isolanguages.parquet")
# Skip the header-like first row and keep only languages that have a
# two-letter ISO639-1 code.
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
# all_langs = languagecodes.iso_languages_byname
# Lookup tables used when mapping detector output back to readable names.
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos} # {'Romanian': ('ro', 'rum', 'ron')}
iso1toall = {iso[1]: (iso[0], iso[2], iso[3]) for iso in non_empty_isos} # {'ro': ('Romanian', 'rum', 'ron')}
DEFAULTS = None
  
# Detection libraries offered in the UI checkbox group (all on by default).
libraries = ["langdetect", "py3langid", "langid", "lingua-py", "pycld2", "fastlangdetect", "fasttext", "openlid", "glotlid"]

class Detect():
    def __init__(self, text: str) -> None:
        self.text: str = text
    def langdetect(self) -> list[str, float]:
        from langdetect import detect, detect_langs
        from langdetect import DetectorFactory
        DetectorFactory.seed = 0
        langcode = detect(self.text)
        langecode_probabilities: list[Language] = detect_langs(self.text)
        return [langcode, round(number=langecode_probabilities[0].prob * 100, ndigits=2)]
    def langid(self) -> list[str, float]:
        from langid.langid import LanguageIdentifier, model
        identifier = LanguageIdentifier.from_modelstring(string=model, norm_probs=True)
        idresult: list[str, float] = list(identifier.classify(self.text))
        return [idresult[0], abs(round(number=idresult[1] * 100, ndigits=2))]
    def py3langid(self) -> list[str, float]:
        langs = ["af", "am", "an", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", "cs", "cy",
                 "da", "de", "dz", "el", "en", "eo", "es", "et", "eu", "fa", "fi", "fo", "fr", "ga",
                 "gl", "gu", "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jv", "ka",
                 "kk", "km", "kn", "ko", "ku", "ky", "la", "lb", "lo", "lt", "lv", "mg", "mk", "ml",
                 "mn", "mr", "ms", "mt", "nb", "ne", "nl", "nn", "no", "oc", "or", "pa", "pl", "ps",
                 "pt", "qu", "ro", "ru", "rw", "se", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta",
                 "te", "th", "tl", "tr", "ug", "uk", "ur", "vi", "vo", "wa", "xh", "zh", "zu"]
        import py3langid
        lang, prob = py3langid.classify(self.text) # unpack the result tuple in variables
        return [lang, abs(round(number=prob, ndigits=2))]       
    def lingua(self) -> list[str, float]:
        from lingua import Language, LanguageDetectorBuilder
        detector: LanguageDetector = LanguageDetectorBuilder.from_all_languages().with_preloaded_language_models().build()
        confidence_values: List[ConfidenceValue] = detector.compute_language_confidence_values(self.text)
        return [confidence_values[0].language.iso_code_639_1.name.lower(), "{0:.2f}".format(confidence_values[0].value * 100)]
    def fasttextlangdetect(self) -> list[str, float]: # requires numpy < 2.0
        from ftlangdetect import detect
        result = detect(text=self.text, low_memory=False)
        return [result.get('lang'), abs(round(number=result.get('score') * 100, ndigits=2))]
    def fastlangdetect(self) -> list[str, float]:
        from fast_langdetect import detect
        result = detect(text=self.text, model="auto", k=1)[0]
        return [result.get('lang'), abs(round(number=result.get('score') * 100, ndigits=2))]
    def pycld2(self) -> list[str, float]:
        import pycld2 as cld2
        # available_languages = cld2.LANGUAGES
        isReliable, textBytesFound, details = cld2.detect(self.text, returnVectors=False, bestEffort=True)
        return [details[0][1], round(details[0][2], 2)]
    def parse_fastext(self, repo_id, k=3):
        import fasttext
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download(repo_id=repo_id, filename="model.bin")
        model = fasttext.load_model(model_path)
        language, probabilities = model.predict(self.text, k=k)
        reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
        long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
        lang_code = all_langs[long_langname][0]
        return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]     
    def fasttext(self) -> list[str, float]:
        import fasttext
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
        model = fasttext.load_model(model_path)
        language, probabilities = model.predict(self.text, k=3)
        reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
        long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
        lang_code = all_langs[long_langname][0]
        return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
    def openlid(self) -> list[str, float]:
        import fasttext
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download(repo_id="laurievb/OpenLID-v2", filename="model.bin")
        model = fasttext.load_model(model_path)
        language, probabilities = model.predict(self.text, k=3)
        reversed_nllb_langs = {v: k for k, v in languagecodes.nllb_language_codes.items()}
        long_langname = reversed_nllb_langs[language[0].replace('__label__', '')]
        lang_code = all_langs[long_langname][0]
        return [lang_code, round(number=probabilities[0] * 100, ndigits=2)]
    def glotlid(self) -> list[str, float]:
        repo_id="cis-lmu/glotlid"
        return self.parse_fastext(repo_id)

def detect_language(input_text: str, used_libraries: list[str]) -> list[list]:
    """
    Detects the language of the input text with each selected library.

    Parameters:
        input_text (str): The text whose language should be detected
        used_libraries (list[str]): The libraries to be used for detection

    Returns:
        list of rows ``[library, language_code, confidence]``, one per
        selected library, followed by a summary row listing the unique
        languages/codes detected. On empty input (or no library selected)
        a single explanatory row is returned instead.

    Example:
        >>> detect_language("Hello world", ["langdetect", "langid", "lingua-py", "fasttextlangdetect", "fastlangdetect"])
        [["langdetect", "en", 99.99], ...]
    """
    # Guard first so no detector instance is built for empty requests.
    if not input_text or not used_libraries:
        return [['No input text or library selected', 'Please provide input text and/or select a detection library']]
    detectinstance = Detect(input_text)
    # Ordered dispatch table: UI library name -> bound detection method.
    dispatch = {
        'langdetect': detectinstance.langdetect,
        'langid': detectinstance.langid,
        'py3langid': detectinstance.py3langid,
        'lingua-py': detectinstance.lingua,
        'pycld2': detectinstance.pycld2,
        'fastlangdetect': detectinstance.fastlangdetect,
        'fasttext': detectinstance.fasttext,
        'openlid': detectinstance.openlid,
        'glotlid': detectinstance.glotlid,
    }
    detections = [[name] + method() for name, method in dispatch.items() if name in used_libraries]
    # Sort so the summary row is deterministic (set iteration order is not).
    unique_codes = sorted({row[1] for row in detections})
    # Fall back to the raw code for anything not in the ISO639-1 table
    # (e.g. pycld2 can return 'un' for unknown).
    unique_languages = [iso1toall.get(code, (code,))[0] for code in unique_codes]
    detections.append([f'Unique languages: {unique_languages}', f'Unique codes: {unique_codes}', f'Languages detected: {len(unique_codes)}'])
    print(unique_codes, unique_languages, detections)
    return detections
    
# Build the Gradio UI: a text box, a checkbox group to pick detectors,
# and a dataframe showing one row per detector result.
with gr.Blocks() as interface:
    gr.Markdown("### Language Detection with Gradio API and MCP Server")
    # Submitting the textbox (button or Enter) triggers detection.
    input_text = gr.Textbox(label="Enter text to detect:", placeholder="Type/copy text here, maximum 512 characters",
                            autofocus=True, submit_btn='Detect Language', max_length=512) 
    with gr.Row(variant="compact"):
        # All libraries are pre-selected by default.
        used_libraries = gr.CheckboxGroup(choices=libraries, value=libraries, label="Detection libraries", show_select_all=True)
    dataframe = gr.Dataframe(
            headers=["Library", "Language  code", "Score"],
            datatype=["str", "str", "number"],
            type='array',
            row_count=len(libraries),
            column_count=3,
            column_limits=(2, 4),
            label='Language detection dataframe'
        )
    # Wire the submit event to the detection function; its list-of-lists
    # return value fills the dataframe.
    input_text.submit(
    fn=detect_language,
    inputs=[input_text, used_libraries],
    outputs=[dataframe]
    )
    
if __name__ == "__main__":
    # Launch the app with the MCP server enabled so the detection function
    # is also exposed as an MCP tool.
    interface.launch(mcp_server=True, footer_links=["api", "settings"])
    # interface.queue().launch(server_name="0.0.0.0", show_error=True, server_port=7860, mcp_server=True)