nexusbert commited on
Commit
c7ece81
·
1 Parent(s): 4695df6

Refactor climate advisory agent to support video input and improve model loading. Update requirements to use the latest transformers from GitHub and add new utility dependencies. Clean up code by removing unnecessary comments and enhancing descriptions for clarity.

Browse files
app/agents/climate_agent.py CHANGED
@@ -1,19 +1,23 @@
1
  """
2
  Farmer-First Climate-Resilient Advisory Agent
3
 
4
- Uses a multimodal Qwen-VL model to provide climate-resilient advice to
5
- smallholder farmers based on text, optional photo, and GPS location.
 
 
6
  """
7
 
8
  import io
9
  import logging
 
 
10
  from typing import Optional, Dict, Any
11
 
12
- from PIL import Image
13
  import requests
 
14
 
15
  from app.utils import config
16
- from app.utils.model_manager import load_multimodal_model
17
  from app.utils.memory import memory_store
18
 
19
  logging.basicConfig(
@@ -59,19 +63,35 @@ def _build_weather_context(latitude: Optional[float], longitude: Optional[float]
59
  return ""
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
62
  def advise_climate_resilient(
63
  query: str,
64
  session_id: str,
65
  latitude: Optional[float] = None,
66
  longitude: Optional[float] = None,
67
  image_bytes: Optional[bytes] = None,
 
68
  ) -> Dict[str, Any]:
69
  """
70
  Run the Farmer-First Climate-Resilient advisory pipeline with optional image + GPS.
71
-
72
- All reasoning is handled by a multimodal Qwen-VL model.
 
 
 
73
  """
74
  processor, model = load_multimodal_model(config.MULTIMODAL_MODEL_NAME)
 
75
 
76
  # Conversation history (text-only, 1-hour TTL shared with core pipeline)
77
  history = memory_store.get_history(session_id) or []
@@ -122,6 +142,7 @@ def advise_climate_resilient(
122
  else "No photo is attached. Use only the text and any weather/location information.\n"
123
  )
124
 
 
125
  prompt_parts = [system_prompt]
126
  if location_context:
127
  prompt_parts.append("\nLOCATION & WEATHER CONTEXT:\n")
@@ -141,39 +162,82 @@ def advise_climate_resilient(
141
 
142
  full_prompt = "".join(prompt_parts)
143
 
144
- # Prepare multimodal inputs
145
- inputs = None
146
- image = None
147
- if image_bytes:
148
- try:
149
- image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
150
- except Exception as e:
151
- logging.warning(f"Failed to decode image bytes, falling back to text-only: {e}")
152
- image = None
153
-
154
- if image is not None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  inputs = processor(
156
- text=full_prompt,
157
- images=image,
 
 
158
  return_tensors="pt",
159
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  else:
161
- inputs = processor(
162
- text=full_prompt,
 
 
 
163
  return_tensors="pt",
 
 
 
 
 
 
164
  )
165
-
166
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
167
-
168
- generated_ids = model.generate(
169
- **inputs,
170
- max_new_tokens=512,
171
- temperature=0.4,
172
- top_p=0.9,
173
- )
174
-
175
- outputs = processor.batch_decode(generated_ids, skip_special_tokens=True)
176
- answer = (outputs[0] if outputs else "").strip()
177
 
178
  # Save to shared memory history
179
  history.append({"role": "user", "content": query})
@@ -185,8 +249,9 @@ def advise_climate_resilient(
185
  "answer": answer,
186
  "latitude": latitude,
187
  "longitude": longitude,
188
- "used_image": bool(image is not None),
189
- "model_used": config.MULTIMODAL_MODEL_NAME,
 
190
  }
191
 
192
 
 
1
  """
2
  Farmer-First Climate-Resilient Advisory Agent
3
 
4
+ Uses a multimodal Qwen2-VL model (when available) to provide
5
+ climate-resilient advice to smallholder farmers from text, optional
6
+ photo/video, and GPS location. Falls back to text-only Qwen on
7
+ environments where Qwen2-VL cannot be fully initialized.
8
  """
9
 
10
  import io
11
  import logging
12
+ import os
13
+ import tempfile
14
  from typing import Optional, Dict, Any
15
 
 
16
  import requests
17
+ from qwen_vl_utils import process_vision_info
18
 
19
  from app.utils import config
20
+ from app.utils.model_manager import load_multimodal_model, load_expert_model
21
  from app.utils.memory import memory_store
22
 
23
  logging.basicConfig(
 
63
  return ""
64
 
65
 
66
+ def _save_temp_file(data: bytes, suffix: str) -> str:
67
+ """
68
+ Save bytes to a temporary file and return a file:// URI for Qwen2-VL.
69
+ """
70
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
71
+ tmp.write(data)
72
+ tmp.flush()
73
+ tmp.close()
74
+ return f"file://{tmp.name}"
75
+
76
+
77
  def advise_climate_resilient(
78
  query: str,
79
  session_id: str,
80
  latitude: Optional[float] = None,
81
  longitude: Optional[float] = None,
82
  image_bytes: Optional[bytes] = None,
83
+ video_bytes: Optional[bytes] = None,
84
  ) -> Dict[str, Any]:
85
  """
86
  Run the Farmer-First Climate-Resilient advisory pipeline with optional image + GPS.
87
+
88
+ Tries to use a multimodal Qwen-VL model when available; if the
89
+ multimodal stack cannot be loaded on this environment, gracefully
90
+ falls back to text-only Qwen while still using location/weather
91
+ context.
92
  """
93
  processor, model = load_multimodal_model(config.MULTIMODAL_MODEL_NAME)
94
+ use_multimodal = processor is not None and model is not None
95
 
96
  # Conversation history (text-only, 1-hour TTL shared with core pipeline)
97
  history = memory_store.get_history(session_id) or []
 
142
  else "No photo is attached. Use only the text and any weather/location information.\n"
143
  )
144
 
145
+ # Build a single user text block that includes context + question.
146
  prompt_parts = [system_prompt]
147
  if location_context:
148
  prompt_parts.append("\nLOCATION & WEATHER CONTEXT:\n")
 
162
 
163
  full_prompt = "".join(prompt_parts)
164
 
165
+ # Multimodal path (if supported)
166
+ answer = ""
167
+ used_image_flag = False
168
+ used_video_flag = False
169
+
170
+ if use_multimodal:
171
+ # Build Qwen2-VL messages following official pattern
172
+ image_uri = _save_temp_file(image_bytes, ".jpg") if image_bytes else None
173
+ video_uri = _save_temp_file(video_bytes, ".mp4") if video_bytes else None
174
+
175
+ user_content = []
176
+ if image_uri:
177
+ user_content.append({"type": "image", "image": image_uri})
178
+ used_image_flag = True
179
+ if video_uri:
180
+ user_content.append(
181
+ {
182
+ "type": "video",
183
+ "video": video_uri,
184
+ "fps": 1.0,
185
+ }
186
+ )
187
+ used_video_flag = True
188
+
189
+ user_content.append({"type": "text", "text": full_prompt})
190
+
191
+ messages = [
192
+ {"role": "system", "content": system_prompt},
193
+ {"role": "user", "content": user_content},
194
+ ]
195
+
196
+ text_prompt = processor.apply_chat_template(
197
+ messages, tokenize=False, add_generation_prompt=True
198
+ )
199
+ image_inputs, video_inputs = process_vision_info(messages)
200
+
201
  inputs = processor(
202
+ text=[text_prompt],
203
+ images=image_inputs,
204
+ videos=video_inputs,
205
+ padding=True,
206
  return_tensors="pt",
207
  )
208
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
209
+
210
+ generated_ids = model.generate(
211
+ **inputs,
212
+ max_new_tokens=512,
213
+ temperature=0.4,
214
+ top_p=0.9,
215
+ )
216
+ generated_ids_trimmed = [
217
+ out_ids[len(in_ids) :]
218
+ for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
219
+ ]
220
+ outputs = processor.batch_decode(
221
+ generated_ids_trimmed,
222
+ skip_special_tokens=True,
223
+ clean_up_tokenization_spaces=False,
224
+ )
225
+ answer = (outputs[0] if outputs else "").strip()
226
  else:
227
+ # Fallback: text-only Qwen expert model, still using climate-aware prompt
228
+ logging.info("Multimodal model unavailable; using text-only expert model for /advise.")
229
+ tokenizer, text_model = load_expert_model(config.EXPERT_MODEL_NAME, use_quantization=True)
230
+ inputs = tokenizer(
231
+ full_prompt,
232
  return_tensors="pt",
233
+ ).to(text_model.device)
234
+ generated_ids = text_model.generate(
235
+ **inputs,
236
+ max_new_tokens=512,
237
+ temperature=0.4,
238
+ top_p=0.9,
239
  )
240
+ answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  # Save to shared memory history
243
  history.append({"role": "user", "content": query})
 
249
  "answer": answer,
250
  "latitude": latitude,
251
  "longitude": longitude,
252
+ "used_image": bool(used_image_flag),
253
+ "used_video": bool(used_video_flag),
254
+ "model_used": config.MULTIMODAL_MODEL_NAME if use_multimodal else config.EXPERT_MODEL_NAME,
255
  }
256
 
257
 
app/agents/crew_pipeline.py CHANGED
@@ -1,4 +1,3 @@
1
- # Aglimate/app/agents/crew_pipeline.py
2
  import os
3
  import sys
4
  import re
@@ -13,21 +12,14 @@ from huggingface_hub import hf_hub_download
13
  from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, NllbTokenizer
14
  from sentence_transformers import SentenceTransformer
15
  from app.utils import config
16
- from app.utils.memory import memory_store # memory module
17
  from typing import List
18
 
19
 
20
- hf_cache = "/models/huggingface"
21
- os.environ["HF_HOME"] = hf_cache
22
- os.environ["TRANSFORMERS_CACHE"] = hf_cache
23
- os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
24
- os.makedirs(hf_cache, exist_ok=True)
25
-
26
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
27
  if BASE_DIR not in sys.path:
28
  sys.path.insert(0, BASE_DIR)
29
 
30
- # Lazy loading - models loaded on demand via model_manager
31
  from app.utils.model_manager import (
32
  load_expert_model,
33
  load_translation_model,
@@ -37,9 +29,7 @@ from app.utils.model_manager import (
37
  get_device
38
  )
39
 
40
- DEVICE = get_device() # Always CPU for HuggingFace Spaces
41
-
42
- # Models will be loaded lazily when needed
43
  _tokenizer = None
44
  _model = None
45
  _embedder = None
@@ -50,7 +40,6 @@ _classifier = None
50
 
51
 
52
  def get_expert_model():
53
- """Lazy load expert model."""
54
  global _tokenizer, _model
55
  if _tokenizer is None or _model is None:
56
  _tokenizer, _model = load_expert_model(config.EXPERT_MODEL_NAME, use_quantization=True)
@@ -58,7 +47,6 @@ def get_expert_model():
58
 
59
 
60
  def get_embedder():
61
- """Lazy load embedder."""
62
  global _embedder
63
  if _embedder is None:
64
  _embedder = load_embedder(config.EMBEDDING_MODEL)
@@ -66,7 +54,6 @@ def get_embedder():
66
 
67
 
68
  def get_lang_identifier():
69
- """Lazy load language identifier."""
70
  global _lang_identifier
71
  if _lang_identifier is None:
72
  _lang_identifier = load_lang_identifier(
@@ -77,7 +64,6 @@ def get_lang_identifier():
77
 
78
 
79
  def get_translation_model():
80
- """Lazy load translation model."""
81
  global _translation_tokenizer, _translation_model
82
  if _translation_tokenizer is None or _translation_model is None:
83
  _translation_tokenizer, _translation_model = load_translation_model(config.TRANSLATION_MODEL_NAME)
@@ -85,7 +71,6 @@ def get_translation_model():
85
 
86
 
87
  def get_classifier():
88
- """Lazy load classifier."""
89
  global _classifier
90
  if _classifier is None:
91
  _classifier = load_classifier(config.CLASSIFIER_PATH)
@@ -99,8 +84,6 @@ def detect_language(text: str, top_k: int = 1):
99
  labels, probs = lang_identifier.predict(clean_text, k=top_k)
100
  return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
101
 
102
- # Translation model loaded lazily via get_translation_model()
103
-
104
  SUPPORTED_LANGS = {
105
  "eng_Latn": "English",
106
  "ibo_Latn": "Igbo",
@@ -110,7 +93,6 @@ SUPPORTED_LANGS = {
110
  "amh_Latn": "Amharic",
111
  }
112
 
113
- # Text chunking
114
  _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
115
 
116
  def chunk_text(text: str, max_len: int = 400) -> List[str]:
 
 
1
  import os
2
  import sys
3
  import re
 
12
  from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, NllbTokenizer
13
  from sentence_transformers import SentenceTransformer
14
  from app.utils import config
15
+ from app.utils.memory import memory_store
16
  from typing import List
17
 
18
 
 
 
 
 
 
 
19
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
20
  if BASE_DIR not in sys.path:
21
  sys.path.insert(0, BASE_DIR)
22
 
 
23
  from app.utils.model_manager import (
24
  load_expert_model,
25
  load_translation_model,
 
29
  get_device
30
  )
31
 
32
+ DEVICE = get_device()
 
 
33
  _tokenizer = None
34
  _model = None
35
  _embedder = None
 
40
 
41
 
42
  def get_expert_model():
 
43
  global _tokenizer, _model
44
  if _tokenizer is None or _model is None:
45
  _tokenizer, _model = load_expert_model(config.EXPERT_MODEL_NAME, use_quantization=True)
 
47
 
48
 
49
  def get_embedder():
 
50
  global _embedder
51
  if _embedder is None:
52
  _embedder = load_embedder(config.EMBEDDING_MODEL)
 
54
 
55
 
56
  def get_lang_identifier():
 
57
  global _lang_identifier
58
  if _lang_identifier is None:
59
  _lang_identifier = load_lang_identifier(
 
64
 
65
 
66
  def get_translation_model():
 
67
  global _translation_tokenizer, _translation_model
68
  if _translation_tokenizer is None or _translation_model is None:
69
  _translation_tokenizer, _translation_model = load_translation_model(config.TRANSLATION_MODEL_NAME)
 
71
 
72
 
73
  def get_classifier():
 
74
  global _classifier
75
  if _classifier is None:
76
  _classifier = load_classifier(config.CLASSIFIER_PATH)
 
84
  labels, probs = lang_identifier.predict(clean_text, k=top_k)
85
  return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
86
 
 
 
87
  SUPPORTED_LANGS = {
88
  "eng_Latn": "English",
89
  "ibo_Latn": "Igbo",
 
93
  "amh_Latn": "Amharic",
94
  }
95
 
 
96
  _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
97
 
98
  def chunk_text(text: str, max_len: int = 400) -> List[str]:
app/main.py CHANGED
@@ -1,4 +1,3 @@
1
- # Aglimate_backend/app/main.py
2
  import os
3
  import sys
4
  import logging
@@ -47,7 +46,6 @@ def startup_event():
47
 
48
  @app.get("/")
49
  def home():
50
- """Health check endpoint."""
51
  return {
52
  "status": "Aglimate climate-resilient backend running",
53
  "version": "2.0.0",
@@ -94,7 +92,7 @@ async def advise_climate_resilient_endpoint(
94
  ),
95
  video: Optional[UploadFile] = File(
96
  None,
97
- description="Optional short field video (currently accepted but not yet analyzed; reserved for future use)",
98
  ),
99
  ):
100
  """
@@ -110,9 +108,8 @@ async def advise_climate_resilient_endpoint(
110
  if not session_id:
111
  session_id = str(uuid.uuid4())
112
 
113
- image_bytes = None
114
- if photo is not None:
115
- image_bytes = await photo.read()
116
 
117
  result = advise_climate_resilient(
118
  query=query,
@@ -120,12 +117,9 @@ async def advise_climate_resilient_endpoint(
120
  latitude=latitude,
121
  longitude=longitude,
122
  image_bytes=image_bytes,
 
123
  )
124
 
125
- # video is currently accepted but ignored; kept for forward-compatibility
126
- if video is not None:
127
- result["video_attached"] = True
128
-
129
  return result
130
 
131
  if __name__ == "__main__":
 
 
1
  import os
2
  import sys
3
  import logging
 
46
 
47
  @app.get("/")
48
  def home():
 
49
  return {
50
  "status": "Aglimate climate-resilient backend running",
51
  "version": "2.0.0",
 
92
  ),
93
  video: Optional[UploadFile] = File(
94
  None,
95
+ description="Optional short field video of the farm (optional)",
96
  ),
97
  ):
98
  """
 
108
  if not session_id:
109
  session_id = str(uuid.uuid4())
110
 
111
+ image_bytes = await photo.read() if photo is not None else None
112
+ video_bytes = await video.read() if video is not None else None
 
113
 
114
  result = advise_climate_resilient(
115
  query=query,
 
117
  latitude=latitude,
118
  longitude=longitude,
119
  image_bytes=image_bytes,
120
+ video_bytes=video_bytes,
121
  )
122
 
 
 
 
 
123
  return result
124
 
125
  if __name__ == "__main__":
app/utils/config.py CHANGED
@@ -1,5 +1,3 @@
1
- #
2
- # TerraSyncra_backend/app/utils/config.py
3
  from pathlib import Path
4
  import os
5
  import sys
@@ -26,8 +24,6 @@ CLASSIFIER_CONFIDENCE_THRESHOLD = float(os.getenv("CLASSIFIER_CONFIDENCE_THRESHO
26
 
27
 
28
  EXPERT_MODEL_NAME = os.getenv("EXPERT_MODEL_NAME", "Qwen/Qwen1.5-1.8B")
29
-
30
- # Multimodal expert model (Qwen-VL) for image-aware advisory
31
  MULTIMODAL_MODEL_NAME = os.getenv("MULTIMODAL_MODEL_NAME", "Qwen/Qwen2-VL-2B-Instruct")
32
 
33
  LANG_ID_MODEL_REPO = os.getenv("LANG_ID_MODEL_REPO", "facebook/fasttext-language-identification")
 
 
 
1
  from pathlib import Path
2
  import os
3
  import sys
 
24
 
25
 
26
  EXPERT_MODEL_NAME = os.getenv("EXPERT_MODEL_NAME", "Qwen/Qwen1.5-1.8B")
 
 
27
  MULTIMODAL_MODEL_NAME = os.getenv("MULTIMODAL_MODEL_NAME", "Qwen/Qwen2-VL-2B-Instruct")
28
 
29
  LANG_ID_MODEL_REPO = os.getenv("LANG_ID_MODEL_REPO", "facebook/fasttext-language-identification")
app/utils/model_manager.py CHANGED
@@ -1,8 +1,3 @@
1
- # TerraSyncra/app/utils/model_manager.py
2
- """
3
- Lazy Model Manager for CPU Optimization
4
- Loads models on-demand instead of at import time.
5
- """
6
  import os
7
  import logging
8
  import torch
@@ -11,7 +6,6 @@ from functools import lru_cache
11
 
12
  logging.basicConfig(level=logging.INFO)
13
 
14
- # Global model cache
15
  _models = {
16
  "expert_model": None,
17
  "expert_tokenizer": None,
@@ -24,22 +18,14 @@ _models = {
24
  "classifier": None,
25
  }
26
 
27
- _device = "cpu" # Force CPU for HuggingFace Spaces
28
 
29
 
30
  def get_device():
31
- """Always return CPU for HuggingFace Spaces."""
32
  return _device
33
 
34
 
35
  def load_expert_model(model_name: str, use_quantization: bool = True):
36
- """
37
- Lazy load expert model with optional quantization.
38
-
39
- Args:
40
- model_name: Model identifier
41
- use_quantization: Use INT8 quantization for CPU (recommended)
42
- """
43
  if _models["expert_model"] is not None:
44
  return _models["expert_tokenizer"], _models["expert_model"]
45
 
@@ -48,25 +34,20 @@ def load_expert_model(model_name: str, use_quantization: bool = True):
48
 
49
  logging.info(f"Loading expert model ({model_name})...")
50
 
51
- # Get cache directory from config
52
  cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
53
 
54
  tokenizer = AutoTokenizer.from_pretrained(
55
  model_name,
56
- use_fast=True, # Use fast tokenizer
57
  cache_dir=cache_dir
58
  )
59
 
60
- # Load model with CPU optimizations
61
  model_kwargs = {
62
- "torch_dtype": torch.float32, # Use float32 for CPU
63
  "device_map": "cpu",
64
  "low_cpu_mem_usage": True,
65
  }
66
 
67
- # Note: For CPU, we use float32 (most compatible)
68
- # For quantization on CPU, consider using smaller models or ONNX runtime
69
- # BitsAndBytesConfig is GPU-only, so we skip it for CPU deployment
70
  logging.info("Loading model in float32 for CPU compatibility")
71
 
72
  cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
@@ -77,7 +58,7 @@ def load_expert_model(model_name: str, use_quantization: bool = True):
77
  **model_kwargs
78
  )
79
 
80
- model.eval() # Set to evaluation mode
81
 
82
  _models["expert_model"] = model
83
  _models["expert_tokenizer"] = tokenizer
@@ -88,43 +69,50 @@ def load_expert_model(model_name: str, use_quantization: bool = True):
88
 
89
  def load_multimodal_model(model_name: str):
90
  """
91
- Lazy load multimodal Qwen-VL model (vision-language).
92
- Used for photo-aware advisory.
93
  """
94
  if _models["multimodal_model"] is not None:
95
  return _models["multimodal_processor"], _models["multimodal_model"]
96
 
97
- # Note: current transformers build on HF Spaces may not expose AutoModelForVision2Seq.
98
- # We rely on Qwen's remote code with AutoModelForCausalLM instead.
99
- from transformers import AutoProcessor, AutoModelForCausalLM
100
  from app.utils import config
101
 
102
  logging.info(f"Loading multimodal expert model ({model_name})...")
103
 
104
  cache_dir = getattr(config, "hf_cache", "/models/huggingface")
105
 
106
- processor = AutoProcessor.from_pretrained(
107
- model_name,
108
- cache_dir=cache_dir,
109
- trust_remote_code=True,
110
- )
111
-
112
- model = AutoModelForCausalLM.from_pretrained(
113
- model_name,
114
- torch_dtype=torch.float32,
115
- cache_dir=cache_dir,
116
- device_map="cpu",
117
- low_cpu_mem_usage=True,
118
- trust_remote_code=True,
119
- )
120
-
121
- model.eval()
122
-
123
- _models["multimodal_model"] = model
124
- _models["multimodal_processor"] = processor
125
-
126
- logging.info("Multimodal expert model loaded successfully")
127
- return processor, model
 
 
 
 
 
 
 
128
 
129
 
130
  def load_translation_model(model_name: str):
 
 
 
 
 
 
1
  import os
2
  import logging
3
  import torch
 
6
 
7
  logging.basicConfig(level=logging.INFO)
8
 
 
9
  _models = {
10
  "expert_model": None,
11
  "expert_tokenizer": None,
 
18
  "classifier": None,
19
  }
20
 
21
+ _device = "cpu"
22
 
23
 
24
  def get_device():
 
25
  return _device
26
 
27
 
28
  def load_expert_model(model_name: str, use_quantization: bool = True):
 
 
 
 
 
 
 
29
  if _models["expert_model"] is not None:
30
  return _models["expert_tokenizer"], _models["expert_model"]
31
 
 
34
 
35
  logging.info(f"Loading expert model ({model_name})...")
36
 
 
37
  cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
38
 
39
  tokenizer = AutoTokenizer.from_pretrained(
40
  model_name,
41
+ use_fast=True,
42
  cache_dir=cache_dir
43
  )
44
 
 
45
  model_kwargs = {
46
+ "torch_dtype": torch.float32,
47
  "device_map": "cpu",
48
  "low_cpu_mem_usage": True,
49
  }
50
 
 
 
 
51
  logging.info("Loading model in float32 for CPU compatibility")
52
 
53
  cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
 
58
  **model_kwargs
59
  )
60
 
61
+ model.eval()
62
 
63
  _models["expert_model"] = model
64
  _models["expert_tokenizer"] = tokenizer
 
69
 
70
def load_multimodal_model(model_name: str):
    """
    Lazily load and cache the Qwen2-VL vision-language stack.

    Returns ``(processor, model)`` on success, or ``(None, None)`` when the
    multimodal stack cannot be initialized in this environment — callers are
    expected to fall back to the text-only expert model in that case.
    """
    # Serve from the process-wide cache when a previous call succeeded.
    if _models["multimodal_model"] is not None:
        return _models["multimodal_processor"], _models["multimodal_model"]

    # Imported lazily so environments lacking Qwen2-VL support only pay the
    # cost (and can only fail) when the multimodal path is actually used.
    # With latest transformers + qwen-vl-utils, Qwen2VLForConditionalGeneration
    # and AutoProcessor support full image/video chat as in the official docs.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
    from app.utils import config

    logging.info(f"Loading multimodal expert model ({model_name})...")

    hf_cache = getattr(config, "hf_cache", "/models/huggingface")

    try:
        vl_processor = AutoProcessor.from_pretrained(
            model_name,
            cache_dir=hf_cache,
        )
        vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # CPU deployment
            cache_dir=hf_cache,
            device_map="cpu",
            low_cpu_mem_usage=True,
        ).eval()

        _models["multimodal_model"] = vl_model
        _models["multimodal_processor"] = vl_processor

        logging.info("Multimodal expert model loaded successfully")
        return vl_processor, vl_model
    except Exception as e:
        # Deliberate broad catch: any failure (missing class, OOM, download
        # error) downgrades the service to text-only rather than crashing.
        logging.error(
            f"Failed to load multimodal model {model_name}: {e}. "
            "Falling back to text-only expert model."
        )
        _models["multimodal_model"] = None
        _models["multimodal_processor"] = None
        return None, None
116
 
117
 
118
  def load_translation_model(model_name: str):
requirements.txt CHANGED
@@ -2,7 +2,7 @@ crewai
2
  langchain
3
  langchain-community
4
  faiss-cpu
5
- transformers>=4.51.0
6
  sentence-transformers
7
  pydantic
8
  joblib
@@ -21,4 +21,5 @@ sentencepiece
21
  fasttext
22
  pillow
23
  cachetools
24
- python-multipart
 
 
2
  langchain
3
  langchain-community
4
  faiss-cpu
5
+ # NOTE(review): unpinned VCS dependency — builds are not reproducible; pin a
+ # commit, e.g. transformers @ git+https://github.com/huggingface/transformers@<sha>
+ transformers @ git+https://github.com/huggingface/transformers
6
  sentence-transformers
7
  pydantic
8
  joblib
 
21
  fasttext
22
  pillow
23
  cachetools
24
+ python-multipart
25
+ qwen-vl-utils