Spaces:

Remostartdev
/

FARMLINGUA_AI_CONVERSATIONAL

Sleeping

App Files Files Community

drrobot9 committed on Jan 27

Commit

70ac964

1 Parent(s): 620a683

Update app/agents/crew_pipeline.py

Browse files

Files changed (1) hide show

app/agents/crew_pipeline.py +15 -92

app/agents/crew_pipeline.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# farmlingua/app/agents/crew_pipeline.py
 import os
 import sys
 import re
@@ -10,12 +10,13 @@ import numpy as np
 import torch
 import fasttext
 from huggingface_hub import hf_hub_download
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM
 from sentence_transformers import SentenceTransformer
 from app.utils import config
-from app.utils.memory import memory_store
 from typing import List
 hf_cache = "/models/huggingface"
 os.environ["HF_HOME"] = hf_cache
 os.environ["TRANSFORMERS_CACHE"] = hf_cache
@@ -28,11 +29,13 @@ if BASE_DIR not in sys.path:
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     classifier = joblib.load(config.CLASSIFIER_PATH)
 except Exception:
     classifier = None
 print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
 tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
 model = AutoModelForCausalLM.from_pretrained(
@@ -41,9 +44,10 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 embedder = SentenceTransformer(config.EMBEDDING_MODEL)
-# Language detector
 print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
 lang_model_path = hf_hub_download(
     repo_id=config.LANG_ID_MODEL_REPO,
@@ -58,33 +62,11 @@ def detect_language(text: str, top_k: int = 1):
     labels, probs = lang_identifier.predict(clean_text, k=top_k)
     return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
 print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
-LANG_CODE_MAP = {
-    "eng_Latn": "en",    # English
-    "ibo_Latn": "ig",    # Igbo
-    "yor_Latn": "yo",    # Yoruba
-    "hau_Latn": "ha",    # Hausa
-    "swh_Latn": "sw",    # Swahili
-    "amh_Latn": "am",    # Amharic
-}
-translation_tokenizer = AutoTokenizer.from_pretrained(
-    config.TRANSLATION_MODEL_NAME
-)
-translation_model = AutoModelForSeq2SeqLM.from_pretrained(
-    config.TRANSLATION_MODEL_NAME,
-    device_map="auto" if DEVICE == "cuda" else None
-)
 translation_pipeline = pipeline(
-    "translation",
-    model=translation_model,
-    tokenizer=translation_tokenizer,
     device=0 if DEVICE == "cuda" else -1,
     max_new_tokens=400,
 )
@@ -98,7 +80,7 @@ SUPPORTED_LANGS = {
     "amh_Latn": "Amharic",
 }
 _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def chunk_text(text: str, max_len: int = 400) -> List[str]:
@@ -120,75 +102,16 @@ def chunk_text(text: str, max_len: int = 400) -> List[str]:
     return chunks
 def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
-    """Translate text between languages using the model"""
     if not text.strip():
         return text
-    src_code = LANG_CODE_MAP.get(src_lang, "en")
-    tgt_code = LANG_CODE_MAP.get(tgt_lang, "en")
-    if src_code == tgt_code:
-        return text
     chunks = chunk_text(text, max_len=max_chunk_len)
     translated_parts = []
     for chunk in chunks:
-        try:
-            if hasattr(translation_tokenizer, 'lang_code_to_id'):
-                # Set source and target language
-                translation_tokenizer.src_lang = src_code
-                forced_bos_token_id = translation_tokenizer.lang_code_to_id[tgt_code]
-                # Tokenize
-                inputs = translation_tokenizer(chunk, return_tensors="pt")
-                if DEVICE == "cuda":
-                    inputs = {k: v.to(translation_model.device) for k, v in inputs.items()}
-                # Generate translation
-                generated_tokens = translation_model.generate(
-                    **inputs,
-                    forced_bos_token_id=forced_bos_token_id,
-                    max_new_tokens=400
-                )
-                # Decode
-                result = translation_tokenizer.batch_decode(
-                    generated_tokens,
-                    skip_special_tokens=True
-                )[0]
-            else:
-                task_name = f"translation_{src_code}_to_{tgt_code}"
-                try:
-                    specific_pipeline = pipeline(
-                        task_name,
-                        model=translation_model,
-                        tokenizer=translation_tokenizer,
-                        device=0 if DEVICE == "cuda" else -1,
-                        max_new_tokens=400,
-                    )
-                    result = specific_pipeline(chunk)[0]["translation_text"]
-                except:
-                    result = translation_pipeline(
-                        chunk,
-                        src_lang=src_code,
-                        tgt_lang=tgt_code
-                    )[0]["translation_text"]
-            translated_parts.append(result)
-        except Exception as e:
-            print(f"Translation error ({src_code}->{tgt_code}): {e}")
-            translated_parts.append(chunk)
     return " ".join(translated_parts).strip()
 def retrieve_docs(query: str, vs_path: str):
     if not vs_path or not os.path.exists(vs_path):
         return None

+# farmlingua/app/agents/crew_pipeline.py (memory section)
 import os
 import sys
 import re
 import torch
 import fasttext
 from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer
 from app.utils import config
+from app.utils.memory import memory_store  # memory module
 from typing import List
 hf_cache = "/models/huggingface"
 os.environ["HF_HOME"] = hf_cache
 os.environ["TRANSFORMERS_CACHE"] = hf_cache
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 try:
     classifier = joblib.load(config.CLASSIFIER_PATH)
 except Exception:
     classifier = None
 print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
 tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
 model = AutoModelForCausalLM.from_pretrained(
     device_map="auto"
 )
 embedder = SentenceTransformer(config.EMBEDDING_MODEL)
+#   language detector
 print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
 lang_model_path = hf_hub_download(
     repo_id=config.LANG_ID_MODEL_REPO,
     labels, probs = lang_identifier.predict(clean_text, k=top_k)
     return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
+#  Translation model
 print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
 translation_pipeline = pipeline(
+    "translation_en_to_fr",
+    model=config.TRANSLATION_MODEL_NAME,
     device=0 if DEVICE == "cuda" else -1,
     max_new_tokens=400,
 )
     "amh_Latn": "Amharic",
 }
+# Text chunking
 _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def chunk_text(text: str, max_len: int = 400) -> List[str]:
     return chunks
 def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
     if not text.strip():
         return text
     chunks = chunk_text(text, max_len=max_chunk_len)
     translated_parts = []
     for chunk in chunks:
+        res = translation_pipeline(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
+        translated_parts.append(res[0]["translation_text"])
     return " ".join(translated_parts).strip()
+#  RAG retrieval
 def retrieve_docs(query: str, vs_path: str):
     if not vs_path or not os.path.exists(vs_path):
         return None