Remostart commited on
Commit
e4bacee
·
1 Parent(s): 4ecce49

FARMLINGUA AI CONVERSATIONAL initial commit

Browse files
.dockerignore ADDED
File without changes
.dockerigore ADDED
File without changes
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/vectorstore/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ app/vectorstore/live_rag_index/index.faiss filter=lfs diff=lfs merge=lfs -text
38
+ app/venv/bin/python filter=lfs diff=lfs merge=lfs -text
39
+ app/venv/bin/python3 filter=lfs diff=lfs merge=lfs -text
40
+ app/venv/bin/python3.11 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base Image
2
+ FROM python:3.10-slim
3
+
4
+
5
+ ENV DEBIAN_FRONTEND=noninteractive \
6
+ PYTHONUNBUFFERED=1 \
7
+ PYTHONDONTWRITEBYTECODE=1
8
+
9
+
10
+ WORKDIR /code
11
+
12
+ # System Dependencies
13
+ RUN apt-get update && apt-get install -y --no-install-recommends \
14
+ build-essential \
15
+ git \
16
+ curl \
17
+ libopenblas-dev \
18
+ libomp-dev \
19
+ && rm -rf /var/lib/apt/lists/*
20
+
21
+
22
+ COPY requirements.txt .
23
+ RUN pip install --no-cache-dir -r requirements.txt
24
+
25
+ # Hugging Face + model tools
26
+ RUN pip install --no-cache-dir huggingface-hub sentencepiece accelerate fasttext
27
+
28
+ # Hugging Face cache environment
29
+ ENV HF_HOME=/models/huggingface \
30
+ TRANSFORMERS_CACHE=/models/huggingface \
31
+ HUGGINGFACE_HUB_CACHE=/models/huggingface \
32
+ HF_HUB_CACHE=/models/huggingface
33
+
34
+ # Create the cache dir and set permissions
35
+ RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
36
+
37
+ # Pre-download models at build time
38
+ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Qwen/Qwen3-4B-Instruct-2507')" \
39
+ && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')" \
40
+ && python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='facebook/fasttext-language-identification', filename='model.bin')" \
41
+ && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='drrobot9/nllb-ig-yo-ha-finetuned')" \
42
+ && find /models/huggingface -name '*.lock' -delete
43
+
44
+ # Preload tokenizers (avoid runtime delays)
45
+ RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507', use_fast=True)" \
46
+ && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', use_fast=True)" \
47
+ && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('drrobot9/nllb-ig-yo-ha-finetuned', use_fast=True)"
48
+
49
+ # Copy project files
50
+ COPY . .
51
+
52
+ # Expose FastAPI port
53
+ EXPOSE 7860
54
+
55
+ # Run FastAPI app with uvicorn (a single worker process)
56
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.62 kB). View file
 
app/agents/__init__.py ADDED
File without changes
app/agents/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (173 Bytes). View file
 
app/agents/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes). View file
 
app/agents/__pycache__/crew_pipeline.cpython-311.pyc ADDED
Binary file (8.73 kB). View file
 
app/agents/__pycache__/crew_pipeline.cpython-312.pyc ADDED
Binary file (13.7 kB). View file
 
app/agents/crew_pipeline.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua/app/agents/crew_pipeline.py (memory section)
2
+ import os
3
+ import sys
4
+ import re
5
+ import uuid
6
+ import requests
7
+ import joblib
8
+ import faiss
9
+ import numpy as np
10
+ import torch
11
+ import fasttext
12
+ from huggingface_hub import hf_hub_download
13
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
14
+ from sentence_transformers import SentenceTransformer
15
+ from app.utils import config
16
+ from app.utils.memory import memory_store # memory module
17
+ from typing import List
18
+
19
+
20
+ hf_cache = "/models/huggingface"
21
+ os.environ["HF_HOME"] = hf_cache
22
+ os.environ["TRANSFORMERS_CACHE"] = hf_cache
23
+ os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
24
+ os.makedirs(hf_cache, exist_ok=True)
25
+
26
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
27
+ if BASE_DIR not in sys.path:
28
+ sys.path.insert(0, BASE_DIR)
29
+
30
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
+
32
+
33
# Optional intent classifier. Loading is best-effort: when the artifact is
# missing or cannot be deserialized, `classifier` stays None and startup
# continues, but the failure is reported instead of being swallowed silently.
try:
    classifier = joblib.load(config.CLASSIFIER_PATH)
except Exception as exc:
    print(f"Intent classifier unavailable ({exc}); continuing without it.")
    classifier = None
37
+
38
+
39
+ print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
40
+ tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
41
+ model = AutoModelForCausalLM.from_pretrained(
42
+ config.EXPERT_MODEL_NAME,
43
+ torch_dtype="auto",
44
+ device_map="auto"
45
+ )
46
+
47
+
48
+ embedder = SentenceTransformer(config.EMBEDDING_MODEL)
49
+
50
+ # language detector
51
+ print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
52
+ lang_model_path = hf_hub_download(
53
+ repo_id=config.LANG_ID_MODEL_REPO,
54
+ filename=getattr(config, "LANG_ID_MODEL_FILE", "model.bin")
55
+ )
56
+ lang_identifier = fasttext.load_model(lang_model_path)
57
+
58
def detect_language(text: str, top_k: int = 1):
    """Return the ``top_k`` most likely languages for *text*.

    The result is a list of ``(label, probability)`` tuples with the
    ``__label__`` prefix removed from every label. Empty or blank input
    short-circuits to ``[("eng_Latn", 1.0)]`` without calling the model.
    """
    if not (text and text.strip()):
        return [("eng_Latn", 1.0)]

    # Newlines are flattened to spaces before the text reaches the identifier.
    single_line = text.replace("\n", " ").strip()
    raw_labels, raw_probs = lang_identifier.predict(single_line, k=top_k)

    results = []
    for raw_label, raw_prob in zip(raw_labels, raw_probs):
        results.append((raw_label.replace("__label__", ""), float(raw_prob)))
    return results
64
+
65
+ # Translation model
66
+ print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
67
+ translation_pipeline = pipeline(
68
+ "translation",
69
+ model=config.TRANSLATION_MODEL_NAME,
70
+ device=0 if DEVICE == "cuda" else -1,
71
+ max_new_tokens=400,
72
+ )
73
+
74
# Languages the pipeline answers in, keyed by FLORES-200 style code
# (<iso639-3>_<script>). run_pipeline treats any detected label that is not a
# key here as English.
SUPPORTED_LANGS = {
    "eng_Latn": "English",
    "ibo_Latn": "Igbo",
    "yor_Latn": "Yoruba",
    "hau_Latn": "Hausa",
    "swh_Latn": "Swahili",
    # Amharic is written in Ethiopic script; its code is amh_Ethi, not amh_Latn.
    "amh_Ethi": "Amharic",
}
82
+
83
# Text chunking
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')


def _split_long_sentence(sentence: str, max_len: int) -> List[str]:
    """Break one over-long sentence into pieces of at most *max_len* characters.

    Pieces are cut on whitespace where possible; a single word longer than
    *max_len* is sliced mid-word. Requires ``max_len > 0``.
    """
    pieces, current = [], ""
    for word in sentence.split():
        # A word that cannot fit in any chunk is hard-sliced.
        while len(word) > max_len:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_len])
            word = word[max_len:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_len:
            current = f"{current} {word}"
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def chunk_text(text: str, max_len: int = 400) -> List[str]:
    """Split *text* into chunks of at most *max_len* characters.

    Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
    chunks, joined by single spaces. A sentence that is itself longer than
    *max_len* is broken on word boundaries instead of being emitted as one
    oversize chunk. Empty input yields ``[]``. When *max_len* is not positive,
    no hard splitting is attempted and every sentence becomes its own chunk.
    """
    if not text:
        return []
    chunks, current = [], ""
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if max_len > 0 and len(sentence) > max_len:
            if current:
                chunks.append(current)
            # Keep the last piece open so following short sentences can join it.
            *full_pieces, current = _split_long_sentence(sentence, max_len)
            chunks.extend(full_pieces)
            continue
        if len(current) + len(sentence) + 1 <= max_len:
            current = f"{current} {sentence}".strip()
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks
103
+
104
def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
    """Translate *text* from *src_lang* to *tgt_lang* chunk by chunk.

    The text is split with ``chunk_text`` (at most *max_chunk_len* characters
    per chunk), each chunk goes through the translation pipeline, and the
    translated chunks are joined with single spaces. Blank input is returned
    unchanged.
    """
    if text.strip() == "":
        return text

    outputs = [
        translation_pipeline(piece, src_lang=src_lang, tgt_lang=tgt_lang)[0]["translation_text"]
        for piece in chunk_text(text, max_len=max_chunk_len)
    ]
    return " ".join(outputs).strip()
113
+
114
+ # RAG retrieval
115
def retrieve_docs(query: str, vs_path: str):
    """Return up to three stored documents nearest to *query*, or ``None``.

    *vs_path* may be a FAISS index file, or a directory containing
    ``index.faiss`` (the layout produced by LangChain's ``FAISS.save_local``).
    Document texts are looked up in the ``<vs_path>_meta.npy`` sidecar, a
    pickled dict keyed by the stringified vector id. ``None`` is returned when
    the path, index or sidecar is missing or unreadable, or when no neighbour
    maps to a non-empty document. The matching documents are joined with blank
    lines.
    """
    if not vs_path or not os.path.exists(vs_path):
        return None

    # Accept a vectorstore directory as well as a bare index file.
    if os.path.isdir(vs_path):
        index_file = os.path.join(str(vs_path), "index.faiss")
    else:
        index_file = str(vs_path)
    try:
        index = faiss.read_index(index_file)
    except Exception:
        return None

    # NOTE(review): nothing visible in this commit writes "<vs_path>_meta.npy";
    # stores saved by rag_updater keep their texts in LangChain's docstore, so
    # retrieval yields None for them until that sidecar exists. Confirm.
    meta_path = str(vs_path) + "_meta.npy"
    if not os.path.exists(meta_path):
        return None

    query_vec = np.array([embedder.encode(query)], dtype=np.float32)
    _distances, neighbours = index.search(query_vec, k=3)

    # FAISS pads missing neighbours with id -1. A distance of 0 is an exact
    # match, not "no result", so distances are not used to reject hits.
    hit_ids = [int(idx) for idx in neighbours[0] if idx >= 0]
    if not hit_ids:
        return None

    # SECURITY: allow_pickle=True runs pickle on load; only use sidecar files
    # produced by this project, never untrusted ones.
    metadata = np.load(meta_path, allow_pickle=True).item()
    docs = [metadata.get(str(idx), "") for idx in hit_ids]
    docs = [d for d in docs if d]
    return "\n\n".join(docs) if docs else None
133
+
134
+
135
def get_weather(state_name: str) -> str:
    """Fetch current weather for a Nigerian state and format it as text.

    Queries the WeatherAPI ``current.json`` endpoint for
    ``"<state_name>, Nigeria"``. On success the result lists condition,
    temperature, humidity and wind. On any failure (non-200 status, network
    error or timeout, invalid JSON, unexpected payload shape) it returns the
    message ``"Unable to retrieve weather for <state_name>."``.
    """
    # HTTPS, so the API key in the query string is not sent in clear text.
    url = "https://api.weatherapi.com/v1/current.json"
    params = {"key": config.WEATHER_API_KEY, "q": f"{state_name}, Nigeria", "aqi": "no"}
    failure = f"Unable to retrieve weather for {state_name}."
    try:
        r = requests.get(url, params=params, timeout=10)
        if r.status_code != 200:
            return failure
        current = r.json()["current"]
        return (
            f"Weather in {state_name}:\n"
            f"- Condition: {current['condition']['text']}\n"
            f"- Temperature: {current['temp_c']}°C\n"
            f"- Humidity: {current['humidity']}%\n"
            f"- Wind: {current['wind_kph']} kph"
        )
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Timeouts, connection errors, bad JSON or a changed schema must not
        # crash the request being served.
        return failure
149
+
150
+
151
# Keywords match at the START of a word only: "rainfall" and "forecasts" still
# count, but "grain" or "train" no longer trigger the "rain" keyword.
_WEATHER_KEYWORDS_RE = re.compile(r"\b(?:weather|temperature|rain|forecast)")
_LIVE_UPDATE_KEYWORDS_RE = re.compile(r"\b(?:latest|update|breaking|news|current|predict)")


def detect_intent(query: str):
    """Classify *query* and return an ``(intent, extra)`` tuple.

    Checks run in this order:

    1. Weather keywords -> ``("weather", state)`` where *state* is the first
       entry of ``config.STATES`` that appears as a whole word or phrase in
       the query, or ``None`` when no state is named.
    2. Live-update keywords -> ``("live_update", None)``.
    3. The loaded classifier, if it exposes ``predict`` and ``predict_proba``
       -> ``("low_confidence", None)`` below the configured threshold
       (default 0.6), otherwise ``(predicted_intent, None)``.
    4. Fallback -> ``("normal", None)``.
    """
    q_lower = (query or "").lower()

    if _WEATHER_KEYWORDS_RE.search(q_lower):
        for state in getattr(config, "STATES", []):
            # Whole-word match so that "Nigeria" does not select "Niger".
            if re.search(rf"\b{re.escape(state.lower())}\b", q_lower):
                return "weather", state
        return "weather", None

    if _LIVE_UPDATE_KEYWORDS_RE.search(q_lower):
        return "live_update", None

    if hasattr(classifier, "predict") and hasattr(classifier, "predict_proba"):
        try:
            predicted_intent = classifier.predict([query])[0]
            confidence = max(classifier.predict_proba([query])[0])
            if confidence < getattr(config, "CLASSIFIER_CONFIDENCE_THRESHOLD", 0.6):
                return "low_confidence", None
            return predicted_intent, None
        except Exception:
            # Deliberate best-effort: a classifier failure falls through to
            # the "normal" intent rather than failing the request.
            pass
    return "normal", None
172
+
173
+ # expert runner
174
def run_qwen(messages: List[dict], max_new_tokens: int = 1300) -> str:
    """Generate the expert model's reply to a chat *messages* list.

    The messages are rendered with the tokenizer's chat template (generation
    prompt appended), tokenized, moved to the model's device and passed to
    ``model.generate``. Only the tokens produced after the prompt are decoded
    (special tokens skipped), and the stripped text is returned.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_length = len(encoded.input_ids[0])

    generation = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        temperature=0.4,
        repetition_penalty=1.1,
    )

    # Drop the echoed prompt tokens; keep only the newly generated tail.
    reply_ids = generation[0][prompt_length:].tolist()
    reply = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return reply.strip()
185
+
186
+ # Memory
187
+ MAX_HISTORY_MESSAGES = getattr(config, "MAX_HISTORY_MESSAGES", 30)
188
+
189
def build_messages_from_history(history: List[dict], system_prompt: str) -> List[dict]:
    """Return a new message list: the system prompt followed by *history*.

    *history* itself is not modified.
    """
    system_message = {"role": "system", "content": system_prompt}
    return [system_message, *history]
193
+
194
+ # Main pipeline
195
def run_pipeline(user_query: str, session_id: str = None):
    """
    Run FarmLingua pipeline with per-session memory.
    Each session_id keeps its own history.

    Steps: detect the query language (unsupported labels are treated as
    English), translate the query to English if needed, detect the intent,
    load the session history, optionally add weather data or retrieved
    reference text, generate the answer, store the exchange, and translate
    the answer back to the user's language.

    Weather data and retrieved documents are given to the model for the
    current turn only. Just the user's query and the assistant's reply are
    written to session memory, so helper prompts do not use up the history
    window or feed stale weather/news into later turns.

    Returns a dict with ``session_id``, ``detected_language`` and ``answer``.
    """
    if session_id is None:
        session_id = str(uuid.uuid4())  # fallback unique session

    # Language detection
    lang_label, _prob = detect_language(user_query, top_k=1)[0]
    if lang_label not in SUPPORTED_LANGS:
        lang_label = "eng_Latn"
    needs_translation = lang_label != "eng_Latn"

    translated_query = (
        translate_text(user_query, src_lang=lang_label, tgt_lang="eng_Latn")
        if needs_translation
        else user_query
    )

    intent, extra = detect_intent(translated_query)

    # Load conversation history
    history = memory_store.get_history(session_id) or []
    if len(history) > MAX_HISTORY_MESSAGES:
        history = history[-MAX_HISTORY_MESSAGES:]

    history.append({"role": "user", "content": translated_query})

    system_prompt = (
        "You are FarmLingua, an AI assistant for Nigerian farmers. "
        "Answer directly without repeating the question. "
        "Use clear farmer-friendly English with emojis . "
        "Avoid jargon and irrelevant details. "
        "If asked who built you, say: 'KawaFarm LTD developed me to help farmers.'"
    )

    # Transient prompt for this turn; `history` stays free of helper messages.
    prompt_messages = build_messages_from_history(history, system_prompt)

    if intent == "weather" and extra:
        weather_text = get_weather(extra)
        prompt_messages.append({
            "role": "user",
            "content": f"Rewrite this weather update simply for farmers:\n{weather_text}",
        })
        max_new_tokens = 256
    else:
        if intent == "live_update":
            context = retrieve_docs(translated_query, config.LIVE_VS_PATH)
            if context:
                prompt_messages.append({
                    "role": "user",
                    "content": f"Latest agricultural updates:\n{context}",
                })
        elif intent == "low_confidence":
            context = retrieve_docs(translated_query, config.STATIC_VS_PATH)
            if context:
                prompt_messages.append({
                    "role": "user",
                    "content": f"Reference information:\n{context}",
                })
        max_new_tokens = 700

    english_answer = run_qwen(prompt_messages, max_new_tokens=max_new_tokens)

    # Save assistant reply
    history.append({"role": "assistant", "content": english_answer})
    if len(history) > MAX_HISTORY_MESSAGES:
        history = history[-MAX_HISTORY_MESSAGES:]
    memory_store.save_history(session_id, history)

    # Translate back if needed
    final_answer = (
        translate_text(english_answer, src_lang="eng_Latn", tgt_lang=lang_label)
        if needs_translation
        else english_answer
    )

    return {
        "session_id": session_id,
        "detected_language": SUPPORTED_LANGS.get(lang_label, "Unknown"),
        "answer": final_answer
    }
app/main.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua_backend/app/main.py
2
+ import os
3
+ import sys
4
+ import logging
5
+ import uuid
6
+ from fastapi import FastAPI, Body
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ import uvicorn
9
+
10
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
11
+ if BASE_DIR not in sys.path:
12
+ sys.path.insert(0, BASE_DIR)
13
+
14
+ from app.tasks.rag_updater import schedule_updates
15
+ from app.utils import config
16
+ from app.agents.crew_pipeline import run_pipeline
17
+
18
+ logging.basicConfig(
19
+ format="%(asctime)s [%(levelname)s] %(message)s",
20
+ level=logging.INFO
21
+ )
22
+
23
+ app = FastAPI(
24
+ title="farmlingua AI Backend",
25
+ description="Backend service for FARMLINGUA AI with RAG updates, multilingual support, and expert AI pipeline",
26
+ version="1.2.0"
27
+ )
28
+
29
+ app.add_middleware(
30
+ CORSMiddleware,
31
+ allow_origins=getattr(config, "ALLOWED_ORIGINS", ["*"]),
32
+ allow_credentials=True,
33
+ allow_methods=["*"],
34
+ allow_headers=["*"],
35
+ )
36
+
37
@app.on_event("startup")
def startup_event():
    """On application startup, log a message and start the RAG update scheduler."""
    logging.info("Starting farmlingua AI backend...")
    # The scheduler object returned by schedule_updates() is not kept here.
    schedule_updates()
41
+
42
@app.get("/")
def home():
    """Health check endpoint: report status, version and the vectorstore path."""
    payload = dict(
        status="Farmlingua AI backend running",
        version="1.2.0",
        vectorstore_path=config.VECTORSTORE_PATH,
    )
    return payload
50
+
51
@app.post("/ask")
def ask_farmbot(
    query: str = Body(..., embed=True),
    session_id: str = Body(None, embed=True)
):
    """
    Ask farmlingua AI a farming-related question.

    - Supports Hausa, Igbo, Yoruba, Swahili, Amharic, and English.
    - The pipeline detects the user's language, translates if needed,
      and answers in the same language.
    - Conversation memory is kept separately per session_id; a new
      session_id is generated when none is supplied.

    Returns the original query, the answer, the session_id and the
    detected language.
    """
    # Assign a new session if missing.
    session_id = session_id or str(uuid.uuid4())

    logging.info("Received query: %s [session_id=%s]", query, session_id)
    result = run_pipeline(query, session_id=session_id)

    detected_lang = result.get("detected_language", "Unknown")
    logging.info("Detected language: %s", detected_lang)

    response = {"query": query}
    response["answer"] = result.get("answer")
    response["session_id"] = result.get("session_id")
    response["detected_language"] = detected_lang
    return response
78
+
79
+ if __name__ == "__main__":
80
+ uvicorn.run(
81
+ "app.main:app",
82
+ host="0.0.0.0",
83
+ port=getattr(config, "PORT", 7860),
84
+ reload=bool(getattr(config, "DEBUG", False))
85
+ )
app/models/__init__.py ADDED
File without changes
app/tasks/__init__.py ADDED
File without changes
app/tasks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/tasks/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/tasks/__pycache__/rag_updater.cpython-311.pyc ADDED
Binary file (8.43 kB). View file
 
app/tasks/__pycache__/rag_updater.cpython-312.pyc ADDED
Binary file (7.42 kB). View file
 
app/tasks/rag_updater.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua_backend/app/tasks/rag_updater.py
2
+ import os
3
+ import sys
4
+ from datetime import datetime, date
5
+ import logging
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from apscheduler.schedulers.background import BackgroundScheduler
9
+
10
+ from langchain.vectorstores import FAISS
11
+ from langchain.embeddings import SentenceTransformerEmbeddings
12
+ from langchain.docstore.document import Document
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+
15
+ from app.utils import config
16
+
17
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
18
+ if BASE_DIR not in sys.path:
19
+ sys.path.insert(0, BASE_DIR)
20
+
21
+ logging.basicConfig(
22
+ format="%(asctime)s [%(levelname)s] %(message)s",
23
+ level=logging.INFO
24
+ )
25
+
26
+ session = requests.Session()
27
+
28
def fetch_weather_now():
    """Fetch current weather for all configured states.

    Queries the WeatherAPI ``current.json`` endpoint once per entry in
    ``config.STATES`` and returns a list of ``Document`` objects, one per
    state whose response contains a ``"current"`` section. Each document has
    a one-line summary (condition, temperature, humidity) and metadata with
    ``source``, ``location`` and a UTC ``timestamp``. A failure for one state
    is logged and skipped; it does not abort the others.
    """
    # HTTPS, so the API key in the query string is not sent in clear text.
    url = "https://api.weatherapi.com/v1/current.json"
    docs = []
    for state in config.STATES:
        try:
            params = {
                "key": config.WEATHER_API_KEY,
                "q": f"{state}, Nigeria",
                "aqi": "no"
            }
            res = session.get(url, params=params, timeout=10)
            res.raise_for_status()
            data = res.json()

            if "current" in data:
                current = data['current']
                condition = current['condition']['text']
                temp_c = current['temp_c']
                humidity = current['humidity']
                text = (
                    f"Weather in {state}: {condition}, "
                    f"Temperature: {temp_c}°C, Humidity: {humidity}%"
                )
                docs.append(Document(
                    page_content=text,
                    metadata={
                        "source": "WeatherAPI",
                        "location": state,
                        "timestamp": datetime.utcnow().isoformat()
                    }
                ))
        except Exception as e:
            logging.error(f"Weather fetch failed for {state}: {e}")
    return docs
62
+
63
def fetch_harvestplus_articles():
    """Fetch every article listed on the configured "harvestplus" news page.

    Downloads ``config.DATA_SOURCES["harvestplus"]``, takes the text of each
    ``<article>`` element and keeps those longer than 100 characters. Returns
    a list of ``Document`` objects with ``source`` and UTC ``timestamp``
    metadata, or ``[]`` when the request or parsing fails (the error is
    logged).

    No date filtering is applied. The original "today only" check was
    short-circuited by ``or True`` and never filtered anything, so it has
    been removed.
    """
    try:
        res = session.get(config.DATA_SOURCES["harvestplus"], timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        docs = []
        for article in soup.find_all("article"):
            content = article.get_text(strip=True)
            # Skip teasers and navigation fragments that are too short to be useful.
            if content and len(content) > 100:
                docs.append(Document(
                    page_content=content,
                    metadata={
                        "source": "HarvestPlus",
                        "timestamp": datetime.utcnow().isoformat()
                    }
                ))
        return docs
    except Exception as e:
        logging.error(f"HarvestPlus fetch failed: {e}")
        return []
90
+
91
def build_rag_vectorstore(reset=False):
    """Refresh the live RAG vectorstore at ``config.LIVE_VS_PATH``.

    Fetches weather and news documents, splits them into 512-character chunks
    (64 overlap), embeds them and saves the store.

    Args:
        reset: When true, existing files in the vectorstore directory are
            deleted and the store is rebuilt from the freshly fetched
            documents only. When false, the chunks are added to the existing
            store. A new store is created if no usable index exists.

    Nothing is written when no documents could be fetched.
    """
    job_type = "FULL REBUILD" if reset else "INCREMENTAL UPDATE"
    logging.info(f"RAG update started — {job_type}")

    all_docs = fetch_weather_now() + fetch_harvestplus_articles()

    weather_count = sum(1 for d in all_docs if d.metadata['source'] == 'WeatherAPI')
    news_count = sum(1 for d in all_docs if d.metadata['source'] == 'HarvestPlus')
    logging.info(f"Weather docs fetched: {weather_count}")
    logging.info(f"News docs fetched: {news_count}")

    if not all_docs:
        logging.warning("No documents fetched, skipping update")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    chunks = splitter.split_documents(all_docs)

    embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)

    vectorstore_path = config.LIVE_VS_PATH
    index_file = os.path.join(vectorstore_path, "index.faiss")

    if reset and os.path.exists(vectorstore_path):
        for file in os.listdir(vectorstore_path):
            file_path = os.path.join(vectorstore_path, file)
            try:
                os.remove(file_path)
                logging.info(f"Deleted old file: {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    vs = None
    # Check for the index FILE, not just the directory: an empty or
    # half-written directory would otherwise make every incremental run fail.
    if not reset and os.path.exists(index_file):
        try:
            # SECURITY: this unpickles the stored docstore; only load stores
            # written by this application.
            vs = FAISS.load_local(
                vectorstore_path,
                embedder,
                allow_dangerous_deserialization=True
            )
        except Exception as e:
            logging.error(f"Failed to load existing vectorstore, rebuilding: {e}")
            vs = None

    if vs is not None:
        # NOTE(review): documents are not de-duplicated, so each incremental
        # run re-adds the same weather and article chunks until the next reset.
        vs.add_documents(chunks)
    else:
        vs = FAISS.from_documents(chunks, embedder)

    os.makedirs(vectorstore_path, exist_ok=True)
    vs.save_local(vectorstore_path)

    logging.info(f"Vectorstore updated at {vectorstore_path}")
134
+
135
def schedule_updates():
    """Start the background scheduler that keeps the live vectorstore fresh.

    Registers two interval jobs for ``build_rag_vectorstore``: an incremental
    update every 12 hours and a full rebuild every 7 days. Returns the
    started scheduler.
    """
    # Both jobs come due together every 7 days (14 x 12 h). One worker thread
    # makes them run one after the other, so the incremental update never
    # reads the vectorstore directory while the rebuild is deleting it.
    scheduler = BackgroundScheduler(
        executors={"default": {"type": "threadpool", "max_workers": 1}}
    )
    scheduler.add_job(build_rag_vectorstore, 'interval', hours=12, kwargs={"reset": False})
    scheduler.add_job(build_rag_vectorstore, 'interval', days=7, kwargs={"reset": True})
    scheduler.start()
    logging.info("Scheduler started — 12-hour incremental updates + weekly full rebuild")
    return scheduler
app/utils/__init__.py ADDED
File without changes
app/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/utils/__pycache__/config.cpython-311.pyc ADDED
Binary file (1.85 kB). View file
 
app/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (2.33 kB). View file
 
app/utils/__pycache__/memory.cpython-312.pyc ADDED
Binary file (1.71 kB). View file
 
app/utils/config.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # farmlingua_backend/app/utils/config.py
4
+ from pathlib import Path
5
+ import os
6
+ import sys
7
+
8
+
9
+ BASE_DIR = Path(__file__).resolve().parents[2]
10
+
11
+
12
+ if str(BASE_DIR) not in sys.path:
13
+ sys.path.insert(0, str(BASE_DIR))
14
+
15
+ EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
16
+ STATIC_VS_PATH = BASE_DIR / "app" / "vectorstore" / "faiss_index"
17
+ LIVE_VS_PATH = BASE_DIR / "app" / "vectorstore" / "live_rag_index"
18
+
19
+ VECTORSTORE_PATH = LIVE_VS_PATH
20
+
21
+
22
# WeatherAPI key. Must be supplied through the WEATHER_API_KEY environment
# variable; no key is kept in source control. The key that used to be
# hard-coded here is in the repository history and should be revoked.
WEATHER_API_KEY = os.getenv("WEATHER_API_KEY", "")
23
+
24
+
25
+ CLASSIFIER_PATH = BASE_DIR / "app" / "models" / "intent_classifier_v2.joblib"
26
+ CLASSIFIER_CONFIDENCE_THRESHOLD = float(os.getenv("CLASSIFIER_CONFIDENCE_THRESHOLD", "0.6"))
27
+
28
+
29
+ EXPERT_MODEL_NAME = os.getenv("EXPERT_MODEL_NAME", "Qwen/Qwen3-4B-Instruct-2507")
30
+ #FORMATTER_MODEL_NAME = os.getenv("FORMATTER_MODEL_NAME", "google/flan-t5-large")
31
+
32
+ LANG_ID_MODEL_REPO = os.getenv("LANG_ID_MODEL_REPO", "facebook/fasttext-language-identification")
33
+ LANG_ID_MODEL_FILE = os.getenv("LANG_ID_MODEL_FILE", "model.bin")
34
+
35
+ TRANSLATION_MODEL_NAME = os.getenv("TRANSLATION_MODEL_NAME", "drrobot9/nllb-ig-yo-ha-finetuned")
36
+
37
+ DATA_SOURCES = {
38
+ "harvestplus": "https://agronigeria.ng/category/news/",
39
+ }
40
+
41
+ STATES = [
42
+ "Abuja", "Lagos", "Kano", "Kaduna", "Rivers", "Enugu", "Anambra", "Ogun",
43
+ "Oyo", "Delta", "Edo", "Katsina", "Borno", "Benue", "Niger", "Plateau",
44
+ "Bauchi", "Adamawa", "Cross River", "Akwa Ibom", "Ekiti", "Osun", "Ondo",
45
+ "Imo", "Abia", "Ebonyi", "Taraba", "Kebbi", "Zamfara", "Yobe", "Gombe",
46
+ "Sokoto", "Kogi", "Bayelsa", "Nasarawa", "Jigawa"
47
+ ]
48
+
49
+
50
+ hf_cache = "/models/huggingface"
51
+ os.environ["HF_HOME"] = hf_cache
52
+ os.environ["TRANSFORMERS_CACHE"] = hf_cache
53
+ os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
54
+ os.makedirs(hf_cache, exist_ok=True)
app/utils/memory.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #app/utils/memory.py
2
+
3
+ from cachetools import TTLCache
4
+ from threading import Lock
5
+
6
+ memory_cache = TTLCache(maxsize=10000, ttl=3600)
7
+ lock = Lock()
8
+
9
+
10
class MemoryStore:
    """Per-session conversation history backed by the module-level TTL cache.

    Every access to the shared cache is made while holding the module lock.
    """

    def get_history(self, session_id: str):
        """Return a copy of the stored message list for *session_id* ([] if none)."""
        with lock:
            stored = memory_cache.get(session_id, [])
            return list(stored)

    def save_history(self, session_id: str, history: list):
        """Store a copy of *history* for *session_id*, replacing any previous value."""
        snapshot = list(history)
        with lock:
            memory_cache[session_id] = snapshot

    def clear_history(self, session_id: str):
        """Remove the history for *session_id*; a missing session is ignored."""
        with lock:
            memory_cache.pop(session_id, None)
27
+
28
+ memory_store = MemoryStore()
app/vectorstore/__init__.py ADDED
File without changes
app/venv/pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ home = /usr/bin
2
+ include-system-site-packages = false
3
+ version = 3.11.13
4
+ executable = /usr/bin/python3.11
5
+ command = /usr/bin/python3 -m venv /content/drive/MyDrive/farmlingua_backend/app/venv
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ crewai
2
+ langchain
3
+ langchain-community
4
+ faiss-cpu
5
+ transformers
6
+ sentence-transformers
7
+ pydantic
8
+ joblib
9
+ pyyaml
10
+ torch
11
+ fastapi
12
+ uvicorn
13
+ apscheduler
14
+ numpy<2
15
+ requests
16
+ beautifulsoup4
17
+ huggingface-hub
18
+ python-dotenv
19
+ blobfile
20
+ sentencepiece
21
+ fasttext