Spaces: Remostartdev / FARMLINGUA_AI_CONVERSATIONAL

Sleeping

App Files Community

drrobot9 committed on Jan 27

Commit

620a683

1 Parent(s): e4c5a04

Update app/agents/crew_pipeline.py

Browse files

Files changed (1) hide show

app/agents/crew_pipeline.py +91 -14

app/agents/crew_pipeline.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# farmlingua/app/agents/crew_pipeline.pymemorysection
 import os
 import sys
 import re
@@ -10,13 +10,12 @@ import numpy as np
 import torch
 import fasttext
 from huggingface_hub import hf_hub_download
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer
 from app.utils import config
-from app.utils.memory import memory_store  # memory module
 from typing import List
 hf_cache = "/models/huggingface"
 os.environ["HF_HOME"] = hf_cache
 os.environ["TRANSFORMERS_CACHE"] = hf_cache
@@ -29,13 +28,11 @@ if BASE_DIR not in sys.path:
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     classifier = joblib.load(config.CLASSIFIER_PATH)
 except Exception:
     classifier = None
 print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
 tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
 model = AutoModelForCausalLM.from_pretrained(
@@ -44,10 +41,9 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 embedder = SentenceTransformer(config.EMBEDDING_MODEL)
-#   language detector
 print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
 lang_model_path = hf_hub_download(
     repo_id=config.LANG_ID_MODEL_REPO,
@@ -62,11 +58,33 @@ def detect_language(text: str, top_k: int = 1):
     labels, probs = lang_identifier.predict(clean_text, k=top_k)
     return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
-#  Translation model
 print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
 translation_pipeline = pipeline(
     "translation",
-    model=config.TRANSLATION_MODEL_NAME,
     device=0 if DEVICE == "cuda" else -1,
     max_new_tokens=400,
 )
@@ -80,7 +98,7 @@ SUPPORTED_LANGS = {
     "amh_Latn": "Amharic",
 }
-# Text chunking
 _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def chunk_text(text: str, max_len: int = 400) -> List[str]:
@@ -102,16 +120,75 @@ def chunk_text(text: str, max_len: int = 400) -> List[str]:
     return chunks
 def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
     if not text.strip():
         return text
     chunks = chunk_text(text, max_len=max_chunk_len)
     translated_parts = []
     for chunk in chunks:
-        res = translation_pipeline(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
-        translated_parts.append(res[0]["translation_text"])
     return " ".join(translated_parts).strip()
-#  RAG retrieval
 def retrieve_docs(query: str, vs_path: str):
     if not vs_path or not os.path.exists(vs_path):
         return None

+# farmlingua/app/agents/crew_pipeline.py
 import os
 import sys
 import re
 import torch
 import fasttext
 from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
 from sentence_transformers import SentenceTransformer
 from app.utils import config
+from app.utils.memory import memory_store
 from typing import List
 hf_cache = "/models/huggingface"
 os.environ["HF_HOME"] = hf_cache
 os.environ["TRANSFORMERS_CACHE"] = hf_cache
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     classifier = joblib.load(config.CLASSIFIER_PATH)
 except Exception:
     classifier = None
 print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
 tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
 model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 embedder = SentenceTransformer(config.EMBEDDING_MODEL)
+# Language detector
 print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
 lang_model_path = hf_hub_download(
     repo_id=config.LANG_ID_MODEL_REPO,
     labels, probs = lang_identifier.predict(clean_text, k=top_k)
     return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
 print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
+LANG_CODE_MAP = {
+    "eng_Latn": "en",    # English
+    "ibo_Latn": "ig",    # Igbo
+    "yor_Latn": "yo",    # Yoruba
+    "hau_Latn": "ha",    # Hausa
+    "swh_Latn": "sw",    # Swahili
+    "amh_Latn": "am",    # Amharic
+}
+translation_tokenizer = AutoTokenizer.from_pretrained(
+    config.TRANSLATION_MODEL_NAME
+)
+translation_model = AutoModelForSeq2SeqLM.from_pretrained(
+    config.TRANSLATION_MODEL_NAME,
+    device_map="auto" if DEVICE == "cuda" else None
+)
 translation_pipeline = pipeline(
     "translation",
+    model=translation_model,
+    tokenizer=translation_tokenizer,
     device=0 if DEVICE == "cuda" else -1,
     max_new_tokens=400,
 )
     "amh_Latn": "Amharic",
 }
 _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def chunk_text(text: str, max_len: int = 400) -> List[str]:
     return chunks
 def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
+    """Translate text between languages using the model"""
     if not text.strip():
         return text
+    src_code = LANG_CODE_MAP.get(src_lang, "en")
+    tgt_code = LANG_CODE_MAP.get(tgt_lang, "en")
+    if src_code == tgt_code:
+        return text
     chunks = chunk_text(text, max_len=max_chunk_len)
     translated_parts = []
     for chunk in chunks:
+        try:
+            if hasattr(translation_tokenizer, 'lang_code_to_id'):
+                # Set source and target language
+                translation_tokenizer.src_lang = src_code
+                forced_bos_token_id = translation_tokenizer.lang_code_to_id[tgt_code]
+                # Tokenize
+                inputs = translation_tokenizer(chunk, return_tensors="pt")
+                if DEVICE == "cuda":
+                    inputs = {k: v.to(translation_model.device) for k, v in inputs.items()}
+                # Generate translation
+                generated_tokens = translation_model.generate(
+                    **inputs,
+                    forced_bos_token_id=forced_bos_token_id,
+                    max_new_tokens=400
+                )
+                # Decode
+                result = translation_tokenizer.batch_decode(
+                    generated_tokens,
+                    skip_special_tokens=True
+                )[0]
+            else:
+                task_name = f"translation_{src_code}_to_{tgt_code}"
+                try:
+                    specific_pipeline = pipeline(
+                        task_name,
+                        model=translation_model,
+                        tokenizer=translation_tokenizer,
+                        device=0 if DEVICE == "cuda" else -1,
+                        max_new_tokens=400,
+                    )
+                    result = specific_pipeline(chunk)[0]["translation_text"]
+                except:
+                    result = translation_pipeline(
+                        chunk,
+                        src_lang=src_code,
+                        tgt_lang=tgt_code
+                    )[0]["translation_text"]
+            translated_parts.append(result)
+        except Exception as e:
+            print(f"Translation error ({src_code}->{tgt_code}): {e}")
+            translated_parts.append(chunk)
     return " ".join(translated_parts).strip()
 def retrieve_docs(query: str, vs_path: str):
     if not vs_path or not os.path.exists(vs_path):
         return None