drrobot9 committed
Commit b15c6d8 · 1 Parent(s): 0d39360

super initial commit

.dockerignore ADDED
File without changes
.dockerigore ADDED
File without changes
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ app/vectorstore/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+ app/vectorstore/live_rag_index/index.faiss filter=lfs diff=lfs merge=lfs -text
+ app/venv/bin/python filter=lfs diff=lfs merge=lfs -text
+ app/venv/bin/python3 filter=lfs diff=lfs merge=lfs -text
+ app/venv/bin/python3.11 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,56 @@
+ # Base Image
+ FROM python:3.10-slim
+
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1
+
+
+ WORKDIR /code
+
+ # System dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     curl \
+     libopenblas-dev \
+     libomp-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Hugging Face + model tools
+ RUN pip install --no-cache-dir huggingface-hub sentencepiece accelerate fasttext
+
+ # Hugging Face cache environment
+ ENV HF_HOME=/models/huggingface \
+     TRANSFORMERS_CACHE=/models/huggingface \
+     HUGGINGFACE_HUB_CACHE=/models/huggingface \
+     HF_HUB_CACHE=/models/huggingface
+
+ # Create cache dir and set permissions
+ RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
+
+ # Pre-download models at build time
+ RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Qwen/Qwen3-4B-Instruct-2507')" \
+     && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')" \
+     && python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='facebook/fasttext-language-identification', filename='model.bin')" \
+     && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='drrobot9/nllb-ig-yo-ha-finetuned')" \
+     && find /models/huggingface -name '*.lock' -delete
+
+ # Preload tokenizers (avoid runtime delays)
+ RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507', use_fast=True)" \
+     && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', use_fast=True)" \
+     && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('drrobot9/nllb-ig-yo-ha-finetuned', use_fast=True)"
+
+ # Copy project files
+ COPY . .
+
+ # Expose FastAPI port
+ EXPOSE 7860
+
+ # Run FastAPI app with uvicorn (1 worker; raise --workers for more concurrency)
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.65 kB). View file
 
app/agents/__init__.py ADDED
File without changes
app/agents/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (173 Bytes). View file
 
app/agents/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes). View file
 
app/agents/__pycache__/crew_pipeline.cpython-311.pyc ADDED
Binary file (8.73 kB). View file
 
app/agents/__pycache__/crew_pipeline.cpython-312.pyc ADDED
Binary file (13.6 kB). View file
 
app/agents/crew_pipeline.py ADDED
@@ -0,0 +1,280 @@
+ # farmlingua/app/agents/crew_pipeline.py (memory section)
+ import os
+ import sys
+ import re
+ import uuid
+ import requests
+ import joblib
+ import faiss
+ import numpy as np
+ import torch
+ import fasttext
+ from huggingface_hub import hf_hub_download
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from sentence_transformers import SentenceTransformer
+ from app.utils import config
+ from app.utils.memory import memory_store  # memory module
+ from typing import List
+
+
+ hf_cache = "/models/huggingface"
+ os.environ["HF_HOME"] = hf_cache
+ os.environ["TRANSFORMERS_CACHE"] = hf_cache
+ os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
+ os.makedirs(hf_cache, exist_ok=True)
+
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ if BASE_DIR not in sys.path:
+     sys.path.insert(0, BASE_DIR)
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ try:
+     classifier = joblib.load(config.CLASSIFIER_PATH)
+ except Exception:
+     classifier = None
+
+
+ print(f"Loading expert model ({config.EXPERT_MODEL_NAME})...")
+ tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME, use_fast=False)
+ model = AutoModelForCausalLM.from_pretrained(
+     config.EXPERT_MODEL_NAME,
+     torch_dtype="auto",
+     device_map="auto"
+ )
+
+
+ embedder = SentenceTransformer(config.EMBEDDING_MODEL)
+
+ # Language detector
+ print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
+ lang_model_path = hf_hub_download(
+     repo_id=config.LANG_ID_MODEL_REPO,
+     filename=getattr(config, "LANG_ID_MODEL_FILE", "model.bin")
+ )
+ lang_identifier = fasttext.load_model(lang_model_path)
+
+ def detect_language(text: str, top_k: int = 1):
+     if not text or not text.strip():
+         return [("eng_Latn", 1.0)]
+     clean_text = text.replace("\n", " ").strip()
+     labels, probs = lang_identifier.predict(clean_text, k=top_k)
+     return [(l.replace("__label__", ""), float(p)) for l, p in zip(labels, probs)]
+
+ # Translation model
+ print(f"Loading translation model ({config.TRANSLATION_MODEL_NAME})...")
+ translation_pipeline = pipeline(
+     "translation",
+     model=config.TRANSLATION_MODEL_NAME,
+     device=0 if DEVICE == "cuda" else -1,
+     max_new_tokens=400,
+ )
+
+ SUPPORTED_LANGS = {
+     "eng_Latn": "English",
+     "ibo_Latn": "Igbo",
+     "yor_Latn": "Yoruba",
+     "hau_Latn": "Hausa",
+     "swh_Latn": "Swahili",
+     "amh_Latn": "Amharic",  # NB: the FastText LID model normally labels Amharic as amh_Ethi
+ }
+
+ # Text chunking
+ _SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
+
+ def chunk_text(text: str, max_len: int = 400) -> List[str]:
+     if not text:
+         return []
+     sentences = _SENTENCE_SPLIT_RE.split(text)
+     chunks, current = [], ""
+     for s in sentences:
+         if not s:
+             continue
+         if len(current) + len(s) + 1 <= max_len:
+             current = (current + " " + s).strip()
+         else:
+             if current:
+                 chunks.append(current.strip())
+             current = s.strip()
+     if current:
+         chunks.append(current.strip())
+     return chunks
+
+ def translate_text(text: str, src_lang: str, tgt_lang: str, max_chunk_len: int = 400) -> str:
+     if not text.strip():
+         return text
+     chunks = chunk_text(text, max_len=max_chunk_len)
+     translated_parts = []
+     for chunk in chunks:
+         res = translation_pipeline(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
+         translated_parts.append(res[0]["translation_text"])
+     return " ".join(translated_parts).strip()
+
+ # RAG retrieval
+ def retrieve_docs(query: str, vs_path: str):
+     if not vs_path or not os.path.exists(vs_path):
+         return None
+     try:
+         index = faiss.read_index(str(vs_path))
+     except Exception:
+         return None
+     query_vec = np.array([embedder.encode(query)], dtype=np.float32)
+     D, I = index.search(query_vec, k=3)
+     if D[0][0] == 0:
+         return None
+     meta_path = str(vs_path) + "_meta.npy"
+     if os.path.exists(meta_path):
+         metadata = np.load(meta_path, allow_pickle=True).item()
+         docs = [metadata.get(str(idx), "") for idx in I[0] if str(idx) in metadata]
+         docs = [d for d in docs if d]
+         return "\n\n".join(docs) if docs else None
+     return None
+
+
+ def get_weather(state_name: str) -> str:
+     url = "http://api.weatherapi.com/v1/current.json"
+     params = {"key": config.WEATHER_API_KEY, "q": f"{state_name}, Nigeria", "aqi": "no"}
+     r = requests.get(url, params=params, timeout=10)
+     if r.status_code != 200:
+         return f"Unable to retrieve weather for {state_name}."
+     data = r.json()
+     return (
+         f"Weather in {state_name}:\n"
+         f"- Condition: {data['current']['condition']['text']}\n"
+         f"- Temperature: {data['current']['temp_c']}°C\n"
+         f"- Humidity: {data['current']['humidity']}%\n"
+         f"- Wind: {data['current']['wind_kph']} kph"
+     )
+
+
+ def detect_intent(query: str):
+     q_lower = (query or "").lower()
+     if any(word in q_lower for word in ["weather", "temperature", "rain", "forecast"]):
+         for state in getattr(config, "STATES", []):
+             if state.lower() in q_lower:
+                 return "weather", state
+         return "weather", None
+
+     if any(word in q_lower for word in ["latest", "update", "breaking", "news", "current", "predict"]):
+         return "live_update", None
+
+     if hasattr(classifier, "predict") and hasattr(classifier, "predict_proba"):
+         try:
+             predicted_intent = classifier.predict([query])[0]
+             confidence = max(classifier.predict_proba([query])[0])
+             if confidence < getattr(config, "CLASSIFIER_CONFIDENCE_THRESHOLD", 0.6):
+                 return "low_confidence", None
+             return predicted_intent, None
+         except Exception:
+             pass
+     return "normal", None
+
+ # Expert runner
+ def run_qwen(messages: List[dict], max_new_tokens: int = 1300) -> str:
+     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = tokenizer([text], return_tensors="pt").to(model.device)
+     generated_ids = model.generate(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         temperature=0.4,
+         repetition_penalty=1.1
+     )
+     output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()
+     return tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+
+ # Memory
+ MAX_HISTORY_MESSAGES = getattr(config, "MAX_HISTORY_MESSAGES", 30)
+
+ def build_messages_from_history(history: List[dict], system_prompt: str) -> List[dict]:
+     msgs = [{"role": "system", "content": system_prompt}]
+     msgs.extend(history)
+     return msgs
+
+ # Main pipeline
+ def run_pipeline(user_query: str, session_id: str = None):
+     """
+     Run the FarmLingua pipeline with per-session memory.
+     Each session_id keeps its own history.
+     """
+     if session_id is None:
+         session_id = str(uuid.uuid4())  # fallback unique session
+
+     # Language detection
+     lang_label, prob = detect_language(user_query, top_k=1)[0]
+     if lang_label not in SUPPORTED_LANGS:
+         lang_label = "eng_Latn"
+
+     translated_query = (
+         translate_text(user_query, src_lang=lang_label, tgt_lang="eng_Latn")
+         if lang_label != "eng_Latn"
+         else user_query
+     )
+
+     intent, extra = detect_intent(translated_query)
+
+     # Load conversation history
+     history = memory_store.get_history(session_id) or []
+     if len(history) > MAX_HISTORY_MESSAGES:
+         history = history[-MAX_HISTORY_MESSAGES:]
+
+
+     history.append({"role": "user", "content": translated_query})
+
+
+     system_prompt = (
+         "You are DR ROBOT, an AI assistant for Nigerian patients. "
+         "Answer directly without repeating the question. "
+         "Use clear, Nigerian-friendly English with emojis. "
+         "Avoid jargon and irrelevant details. "
+         "If asked who built you, say: 'Kelvin Jackson built and engineered me to help patients.' "
+         "You are a doctor who breaks things down for patients to understand. "
+         "You are to diagnose before recommending any pharmaceutical medications. "
+         "You are a medical consultant, a general medicine doctor, and a mental health doctor. "
+         "Gather the patient's relevant information for diagnosis before telling them what is wrong. "
+         "Always watch out for your patients; be interactive, ask questions, and behave like a real human being. "
+         "If a user mistakenly calls you Dr Roberts or Dr Ruberts, understand they mean Dr Robot. "
+         "When facial diagnosis data is sent to you, study it, diagnose the user accurately from it, and tell them what to do next. "
+         "Be smart about identifying what is wrong with the user; you may prescribe suitable drugs and give a favourable dosage. "
+         "You are strong at medical diagnosis because you have broad medical knowledge. "
+         "Not all users will type correctly or completely, so infer what a user means even if the spelling is wrong or incomplete."
+
+     )
+
+
+     if intent == "weather" and extra:
+         weather_text = get_weather(extra)
+         history.append({"role": "user", "content": f"Rewrite this weather update simply for farmers:\n{weather_text}"})
+         messages_for_qwen = build_messages_from_history(history, system_prompt)
+         english_answer = run_qwen(messages_for_qwen, max_new_tokens=256)
+     else:
+         if intent == "live_update":
+             context = retrieve_docs(translated_query, config.LIVE_VS_PATH)
+             if context:
+                 history.append({"role": "user", "content": f"Latest agricultural updates:\n{context}"})
+         if intent == "low_confidence":
+             context = retrieve_docs(translated_query, config.STATIC_VS_PATH)
+             if context:
+                 history.append({"role": "user", "content": f"Reference information:\n{context}"})
+
+         messages_for_qwen = build_messages_from_history(history, system_prompt)
+         english_answer = run_qwen(messages_for_qwen, max_new_tokens=700)
+
+     # Save assistant reply
+     history.append({"role": "assistant", "content": english_answer})
+     if len(history) > MAX_HISTORY_MESSAGES:
+         history = history[-MAX_HISTORY_MESSAGES:]
+     memory_store.save_history(session_id, history)
+
+     # Translate back if needed
+     final_answer = (
+         translate_text(english_answer, src_lang="eng_Latn", tgt_lang=lang_label)
+         if lang_label != "eng_Latn"
+         else english_answer
+     )
+
+     return {
+         "session_id": session_id,
+         "detected_language": SUPPORTED_LANGS.get(lang_label, "Unknown"),
+         "answer": final_answer
+     }
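For reference, run_pipeline() can also be driven directly from a script or notebook. The following is a minimal usage sketch (not part of the commit) and assumes the models and vectorstores referenced in app/utils/config.py are available on disk.

from app.agents.crew_pipeline import run_pipeline

# First turn: no session_id is given, so the pipeline creates one.
first = run_pipeline("I have had a headache and a mild fever since yesterday.")
print(first["detected_language"], first["session_id"])
print(first["answer"])

# Follow-up: reusing the session_id keeps the history stored in memory_store.
follow_up = run_pipeline("Should I take anything for it?", session_id=first["session_id"])
print(follow_up["answer"])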
app/main.py ADDED
@@ -0,0 +1,85 @@
+ # farmlingua_backend/app/main.py
+ import os
+ import sys
+ import logging
+ import uuid
+ from fastapi import FastAPI, Body
+ from fastapi.middleware.cors import CORSMiddleware
+ import uvicorn
+
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ if BASE_DIR not in sys.path:
+     sys.path.insert(0, BASE_DIR)
+
+ from app.tasks.rag_updater import schedule_updates
+ from app.utils import config
+ from app.agents.crew_pipeline import run_pipeline
+
+ logging.basicConfig(
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     level=logging.INFO
+ )
+
+ app = FastAPI(
+     title="doctor robot",
+     description="Backend service for DOCTOR ROBOT AI with RAG updates, multilingual support, and expert AI pipeline",
+     version="1.2.0"
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=getattr(config, "ALLOWED_ORIGINS", ["*"]),
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.on_event("startup")
+ def startup_event():
+     logging.info("Starting farmlingua AI backend...")
+     schedule_updates()
+
+ @app.get("/")
+ def home():
+     """Health check endpoint."""
+     return {
+         "status": "Farmlingua AI backend running",
+         "version": "1.2.0",
+         "vectorstore_path": config.VECTORSTORE_PATH
+     }
+
+ @app.post("/ask")
+ def ask_farmbot(
+     query: str = Body(..., embed=True),
+     session_id: str = Body(None, embed=True)
+ ):
+     """
+     Ask DOCTOR ROBOT AI a farming-related question.
+     - Supports Hausa, Igbo, Yoruba, Swahili, Amharic, and English.
+     - Automatically detects the user's language, translates if needed,
+       and returns the response in the same language.
+     - Maintains separate conversation memory per session_id.
+     """
+     if not session_id:
+         session_id = str(uuid.uuid4())  # assign a new session if missing
+
+     logging.info(f"Received query: {query} [session_id={session_id}]")
+     answer_data = run_pipeline(query, session_id=session_id)
+
+     detected_lang = answer_data.get("detected_language", "Unknown")
+     logging.info(f"Detected language: {detected_lang}")
+
+     return {
+         "query": query,
+         "answer": answer_data.get("answer"),
+         "session_id": answer_data.get("session_id"),
+         "detected_language": detected_lang
+     }
+
+ if __name__ == "__main__":
+     uvicorn.run(
+         "app.main:app",
+         host="0.0.0.0",
+         port=getattr(config, "PORT", 7860),
+         reload=bool(getattr(config, "DEBUG", False))
+     )
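A sample client call against the /ask endpoint, sketched under the assumption that the service is running locally on the port exposed in the Dockerfile; the JSON field names follow the Body(..., embed=True) parameters above.

import requests

resp = requests.post(
    "http://localhost:7860/ask",
    json={"query": "What should I do about a persistent cough?"},
    timeout=120,
)
resp.raise_for_status()
data = resp.json()
print(data["detected_language"], data["session_id"])
print(data["answer"])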
app/models/__init__.py ADDED
File without changes
app/tasks/__init__.py ADDED
File without changes
app/tasks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/tasks/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/tasks/__pycache__/rag_updater.cpython-311.pyc ADDED
Binary file (8.43 kB). View file
 
app/tasks/__pycache__/rag_updater.cpython-312.pyc ADDED
Binary file (7.42 kB). View file
 
app/tasks/rag_updater.py ADDED
@@ -0,0 +1,141 @@
+ # farmlingua_backend/app/tasks/rag_updater.py
+ import os
+ import sys
+ from datetime import datetime, date
+ import logging
+ import requests
+ from bs4 import BeautifulSoup
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import SentenceTransformerEmbeddings
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ from app.utils import config
+
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ if BASE_DIR not in sys.path:
+     sys.path.insert(0, BASE_DIR)
+
+ logging.basicConfig(
+     format="%(asctime)s [%(levelname)s] %(message)s",
+     level=logging.INFO
+ )
+
+ session = requests.Session()
+
+ def fetch_weather_now():
+     """Fetch current weather for all configured states."""
+     docs = []
+     for state in config.STATES:
+         try:
+             url = "http://api.weatherapi.com/v1/current.json"
+             params = {
+                 "key": config.WEATHER_API_KEY,
+                 "q": f"{state}, Nigeria",
+                 "aqi": "no"
+             }
+             res = session.get(url, params=params, timeout=10)
+             res.raise_for_status()
+             data = res.json()
+
+             if "current" in data:
+                 condition = data['current']['condition']['text']
+                 temp_c = data['current']['temp_c']
+                 humidity = data['current']['humidity']
+                 text = (
+                     f"Weather in {state}: {condition}, "
+                     f"Temperature: {temp_c}°C, Humidity: {humidity}%"
+                 )
+                 docs.append(Document(
+                     page_content=text,
+                     metadata={
+                         "source": "WeatherAPI",
+                         "location": state,
+                         "timestamp": datetime.utcnow().isoformat()
+                     }
+                 ))
+         except Exception as e:
+             logging.error(f"Weather fetch failed for {state}: {e}")
+     return docs
+
+ def fetch_harvestplus_articles():
+     """Fetch ALL of today's articles from the HarvestPlus site."""
+     try:
+         res = session.get(config.DATA_SOURCES["harvestplus"], timeout=10)
+         res.raise_for_status()
+         soup = BeautifulSoup(res.text, "html.parser")
+         articles = soup.find_all("article")
+
+         docs = []
+         today_str = date.today().strftime("%Y-%m-%d")
+
+         for a in articles:
+             content = a.get_text(strip=True)
+             if content and len(content) > 100:
+
+                 if today_str in a.text or True:  # NOTE: "or True" disables the date filter, so every article is kept
+                     docs.append(Document(
+                         page_content=content,
+                         metadata={
+                             "source": "HarvestPlus",
+                             "timestamp": datetime.utcnow().isoformat()
+                         }
+                     ))
+         return docs
+     except Exception as e:
+         logging.error(f"HarvestPlus fetch failed: {e}")
+         return []
+
+ def build_rag_vectorstore(reset=False):
+     job_type = "FULL REBUILD" if reset else "INCREMENTAL UPDATE"
+     logging.info(f"RAG update started — {job_type}")
+
+     all_docs = fetch_weather_now() + fetch_harvestplus_articles()
+
+     logging.info(f"Weather docs fetched: {len([d for d in all_docs if d.metadata['source'] == 'WeatherAPI'])}")
+     logging.info(f"News docs fetched: {len([d for d in all_docs if d.metadata['source'] == 'HarvestPlus'])}")
+
+     if not all_docs:
+         logging.warning("No documents fetched, skipping update")
+         return
+
+     splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
+     chunks = splitter.split_documents(all_docs)
+
+     embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)
+
+     vectorstore_path = config.LIVE_VS_PATH
+
+     if reset and os.path.exists(vectorstore_path):
+         for file in os.listdir(vectorstore_path):
+             file_path = os.path.join(vectorstore_path, file)
+             try:
+                 os.remove(file_path)
+                 logging.info(f"Deleted old file: {file_path}")
+             except Exception as e:
+                 logging.error(f"Failed to delete {file_path}: {e}")
+
+     if os.path.exists(vectorstore_path) and not reset:
+         vs = FAISS.load_local(
+             vectorstore_path,
+             embedder,
+             allow_dangerous_deserialization=True
+         )
+         vs.add_documents(chunks)
+     else:
+         vs = FAISS.from_documents(chunks, embedder)
+
+     os.makedirs(vectorstore_path, exist_ok=True)
+     vs.save_local(vectorstore_path)
+
+     logging.info(f"Vectorstore updated at {vectorstore_path}")
+
+ def schedule_updates():
+     scheduler = BackgroundScheduler()
+     scheduler.add_job(build_rag_vectorstore, 'interval', hours=12, kwargs={"reset": False})
+     scheduler.add_job(build_rag_vectorstore, 'interval', days=7, kwargs={"reset": True})
+     scheduler.start()
+     logging.info("Scheduler started — 12-hour incremental updates + weekly full rebuild")
+     return scheduler
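For completeness, the live index written by build_rag_vectorstore() can be read back with the same LangChain FAISS wrapper used above. A hedged sketch, not part of the commit:

from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from app.utils import config

# Load the saved store and run a small similarity search against it.
embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)
vs = FAISS.load_local(
    str(config.LIVE_VS_PATH),
    embedder,
    allow_dangerous_deserialization=True,
)
for doc in vs.similarity_search("weather in Kano", k=3):
    print(doc.metadata.get("source"), "-", doc.page_content[:80])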
app/utils/__init__.py ADDED
File without changes
app/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/utils/__pycache__/config.cpython-311.pyc ADDED
Binary file (1.85 kB). View file
 
app/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (2.33 kB). View file
 
app/utils/__pycache__/memory.cpython-312.pyc ADDED
Binary file (1.71 kB). View file
 
app/utils/config.py ADDED
@@ -0,0 +1,54 @@
+
+
+ # farmlingua_backend/app/utils/config.py
+ from pathlib import Path
+ import os
+ import sys
+
+
+ BASE_DIR = Path(__file__).resolve().parents[2]
+
+
+ if str(BASE_DIR) not in sys.path:
+     sys.path.insert(0, str(BASE_DIR))
+
+ EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+ STATIC_VS_PATH = BASE_DIR / "app" / "vectorstore" / "faiss_index"
+ LIVE_VS_PATH = BASE_DIR / "app" / "vectorstore" / "live_rag_index"
+
+ VECTORSTORE_PATH = LIVE_VS_PATH
+
+
+ WEATHER_API_KEY = os.getenv("WEATHER_API_KEY", "1eefcad138134d62a1e220003252608")
+
+
+ CLASSIFIER_PATH = BASE_DIR / "app" / "models" / "intent_classifier_v2.joblib"
+ CLASSIFIER_CONFIDENCE_THRESHOLD = float(os.getenv("CLASSIFIER_CONFIDENCE_THRESHOLD", "0.6"))
+
+
+ EXPERT_MODEL_NAME = os.getenv("EXPERT_MODEL_NAME", "Qwen/Qwen3-4B-Instruct-2507")
+ # FORMATTER_MODEL_NAME = os.getenv("FORMATTER_MODEL_NAME", "google/flan-t5-large")
+
+ LANG_ID_MODEL_REPO = os.getenv("LANG_ID_MODEL_REPO", "facebook/fasttext-language-identification")
+ LANG_ID_MODEL_FILE = os.getenv("LANG_ID_MODEL_FILE", "model.bin")
+
+ TRANSLATION_MODEL_NAME = os.getenv("TRANSLATION_MODEL_NAME", "drrobot9/nllb-ig-yo-ha-finetuned")
+
+ DATA_SOURCES = {
+     "harvestplus": "https://agronigeria.ng/category/news/",
+ }
+
+ STATES = [
+     "Abuja", "Lagos", "Kano", "Kaduna", "Rivers", "Enugu", "Anambra", "Ogun",
+     "Oyo", "Delta", "Edo", "Katsina", "Borno", "Benue", "Niger", "Plateau",
+     "Bauchi", "Adamawa", "Cross River", "Akwa Ibom", "Ekiti", "Osun", "Ondo",
+     "Imo", "Abia", "Ebonyi", "Taraba", "Kebbi", "Zamfara", "Yobe", "Gombe",
+     "Sokoto", "Kogi", "Bayelsa", "Nasarawa", "Jigawa"
+ ]
+
+
+ hf_cache = "/models/huggingface"
+ os.environ["HF_HOME"] = hf_cache
+ os.environ["TRANSFORMERS_CACHE"] = hf_cache
+ os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
+ os.makedirs(hf_cache, exist_ok=True)
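Because the settings above fall back to os.getenv defaults, a deployment can override them through the environment before this module is first imported. A small illustrative sketch (the override values are hypothetical, not part of the commit):

import os

# Hypothetical overrides; these must be set before `from app.utils import config` runs.
os.environ["WEATHER_API_KEY"] = "my-real-key"
os.environ["CLASSIFIER_CONFIDENCE_THRESHOLD"] = "0.75"

from app.utils import config
print(config.CLASSIFIER_CONFIDENCE_THRESHOLD)  # 0.75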
app/utils/memory.py ADDED
@@ -0,0 +1,28 @@
+ # app/utils/memory.py
+
+ from cachetools import TTLCache
+ from threading import Lock
+
+ memory_cache = TTLCache(maxsize=10000, ttl=3600)
+ lock = Lock()
+
+
+ class MemoryStore:
+     """In-memory conversational history with a 1-hour expiry."""
+     def get_history(self, session_id: str):
+         """Retrieve the conversation history (a list of messages)."""
+
+         with lock:
+             return memory_cache.get(session_id, []).copy()
+
+     def save_history(self, session_id: str, history: list):
+         """Save/overwrite the conversation history."""
+         with lock:
+             memory_cache[session_id] = history.copy()
+
+     def clear_history(self, session_id: str):
+         """Manually clear a session."""
+         with lock:
+             memory_cache.pop(session_id, None)
+
+ memory_store = MemoryStore()
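Typical use of the store, as a minimal sketch (not part of the commit): entries live for an hour after the last save_history() call, matching the TTLCache ttl above.

from app.utils.memory import memory_store

memory_store.save_history("session-123", [{"role": "user", "content": "Hello"}])
print(memory_store.get_history("session-123"))  # [{'role': 'user', 'content': 'Hello'}]

memory_store.clear_history("session-123")
print(memory_store.get_history("session-123"))  # []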
app/vectorstore/__init__.py ADDED
File without changes
app/venv/pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
+ home = /usr/bin
+ include-system-site-packages = false
+ version = 3.11.13
+ executable = /usr/bin/python3.11
+ command = /usr/bin/python3 -m venv /content/drive/MyDrive/farmlingua_backend/app/venv
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ crewai
+ langchain
+ langchain-community
+ faiss-cpu
+ transformers
+ sentence-transformers
+ pydantic
+ joblib
+ pyyaml
+ torch
+ fastapi
+ uvicorn
+ apscheduler
+ numpy<2
+ requests
+ beautifulsoup4
+ huggingface-hub
+ python-dotenv
+ blobfile
+ sentencepiece
+ fasttext