Update app/tasks/rag_updater.py
Browse files- app/tasks/rag_updater.py +141 -141
app/tasks/rag_updater.py
CHANGED
|
@@ -1,141 +1,141 @@
|
|
| 1 |
-
# farmlingua_backend/app/tasks/rag_updater.py
|
| 2 |
-
import os
|
| 3 |
-
import sys
|
| 4 |
-
from datetime import datetime, date
|
| 5 |
-
import logging
|
| 6 |
-
import requests
|
| 7 |
-
from bs4 import BeautifulSoup
|
| 8 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 9 |
-
|
| 10 |
-
from langchain_community.vectorstores import FAISS
|
| 11 |
-
from langchain_community.embeddings import SentenceTransformerEmbeddings
|
| 12 |
-
from langchain_community.docstore.document import Document
|
| 13 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 14 |
-
|
| 15 |
-
from app.utils import config
|
| 16 |
-
|
| 17 |
-
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 18 |
-
if BASE_DIR not in sys.path:
|
| 19 |
-
sys.path.insert(0, BASE_DIR)
|
| 20 |
-
|
| 21 |
-
logging.basicConfig(
|
| 22 |
-
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 23 |
-
level=logging.INFO
|
| 24 |
-
)
|
| 25 |
-
|
| 26 |
-
session = requests.Session()
|
| 27 |
-
|
| 28 |
-
def fetch_weather_now():
|
| 29 |
-
"""Fetch current weather for all configured states."""
|
| 30 |
-
docs = []
|
| 31 |
-
for state in config.STATES:
|
| 32 |
-
try:
|
| 33 |
-
url = "http://api.weatherapi.com/v1/current.json"
|
| 34 |
-
params = {
|
| 35 |
-
"key": config.WEATHER_API_KEY,
|
| 36 |
-
"q": f"{state}, Nigeria",
|
| 37 |
-
"aqi": "no"
|
| 38 |
-
}
|
| 39 |
-
res = session.get(url, params=params, timeout=10)
|
| 40 |
-
res.raise_for_status()
|
| 41 |
-
data = res.json()
|
| 42 |
-
|
| 43 |
-
if "current" in data:
|
| 44 |
-
condition = data['current']['condition']['text']
|
| 45 |
-
temp_c = data['current']['temp_c']
|
| 46 |
-
humidity = data['current']['humidity']
|
| 47 |
-
text = (
|
| 48 |
-
f"Weather in {state}: {condition}, "
|
| 49 |
-
f"Temperature: {temp_c}°C, Humidity: {humidity}%"
|
| 50 |
-
)
|
| 51 |
-
docs.append(Document(
|
| 52 |
-
page_content=text,
|
| 53 |
-
metadata={
|
| 54 |
-
"source": "WeatherAPI",
|
| 55 |
-
"location": state,
|
| 56 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 57 |
-
}
|
| 58 |
-
))
|
| 59 |
-
except Exception as e:
|
| 60 |
-
logging.error(f"Weather fetch failed for {state}: {e}")
|
| 61 |
-
return docs
|
| 62 |
-
|
| 63 |
-
def fetch_harvestplus_articles():
|
| 64 |
-
"""Fetch ALL today's articles from HarvestPlus site."""
|
| 65 |
-
try:
|
| 66 |
-
res = session.get(config.DATA_SOURCES["harvestplus"], timeout=10)
|
| 67 |
-
res.raise_for_status()
|
| 68 |
-
soup = BeautifulSoup(res.text, "html.parser")
|
| 69 |
-
articles = soup.find_all("article")
|
| 70 |
-
|
| 71 |
-
docs = []
|
| 72 |
-
today_str = date.today().strftime("%Y-%m-%d")
|
| 73 |
-
|
| 74 |
-
for a in articles:
|
| 75 |
-
content = a.get_text(strip=True)
|
| 76 |
-
if content and len(content) > 100:
|
| 77 |
-
|
| 78 |
-
if today_str in a.text or True:
|
| 79 |
-
docs.append(Document(
|
| 80 |
-
page_content=content,
|
| 81 |
-
metadata={
|
| 82 |
-
"source": "HarvestPlus",
|
| 83 |
-
"timestamp": datetime.utcnow().isoformat()
|
| 84 |
-
}
|
| 85 |
-
))
|
| 86 |
-
return docs
|
| 87 |
-
except Exception as e:
|
| 88 |
-
logging.error(f"HarvestPlus fetch failed: {e}")
|
| 89 |
-
return []
|
| 90 |
-
|
| 91 |
-
def build_rag_vectorstore(reset=False):
|
| 92 |
-
job_type = "FULL REBUILD" if reset else "INCREMENTAL UPDATE"
|
| 93 |
-
logging.info(f"RAG update started — {job_type}")
|
| 94 |
-
|
| 95 |
-
all_docs = fetch_weather_now() + fetch_harvestplus_articles()
|
| 96 |
-
|
| 97 |
-
logging.info(f"Weather docs fetched: {len([d for d in all_docs if d.metadata['source'] == 'WeatherAPI'])}")
|
| 98 |
-
logging.info(f"News docs fetched: {len([d for d in all_docs if d.metadata['source'] == 'HarvestPlus'])}")
|
| 99 |
-
|
| 100 |
-
if not all_docs:
|
| 101 |
-
logging.warning("No documents fetched, skipping update")
|
| 102 |
-
return
|
| 103 |
-
|
| 104 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
|
| 105 |
-
chunks = splitter.split_documents(all_docs)
|
| 106 |
-
|
| 107 |
-
embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)
|
| 108 |
-
|
| 109 |
-
vectorstore_path = config.LIVE_VS_PATH
|
| 110 |
-
|
| 111 |
-
if reset and os.path.exists(vectorstore_path):
|
| 112 |
-
for file in os.listdir(vectorstore_path):
|
| 113 |
-
file_path = os.path.join(vectorstore_path, file)
|
| 114 |
-
try:
|
| 115 |
-
os.remove(file_path)
|
| 116 |
-
logging.info(f"Deleted old file: {file_path}")
|
| 117 |
-
except Exception as e:
|
| 118 |
-
logging.error(f"Failed to delete {file_path}: {e}")
|
| 119 |
-
|
| 120 |
-
if os.path.exists(vectorstore_path) and not reset:
|
| 121 |
-
vs = FAISS.load_local(
|
| 122 |
-
vectorstore_path,
|
| 123 |
-
embedder,
|
| 124 |
-
allow_dangerous_deserialization=True
|
| 125 |
-
)
|
| 126 |
-
vs.add_documents(chunks)
|
| 127 |
-
else:
|
| 128 |
-
vs = FAISS.from_documents(chunks, embedder)
|
| 129 |
-
|
| 130 |
-
os.makedirs(vectorstore_path, exist_ok=True)
|
| 131 |
-
vs.save_local(vectorstore_path)
|
| 132 |
-
|
| 133 |
-
logging.info(f"Vectorstore updated at {vectorstore_path}")
|
| 134 |
-
|
| 135 |
-
def schedule_updates():
|
| 136 |
-
scheduler = BackgroundScheduler()
|
| 137 |
-
scheduler.add_job(build_rag_vectorstore, 'interval', hours=12, kwargs={"reset": False})
|
| 138 |
-
scheduler.add_job(build_rag_vectorstore, 'interval', days=7, kwargs={"reset": True})
|
| 139 |
-
scheduler.start()
|
| 140 |
-
logging.info("Scheduler started — 12-hour incremental updates + weekly full rebuild")
|
| 141 |
-
return scheduler
|
|
|
|
| 1 |
+
# farmlingua_backend/app/tasks/rag_updater.py
import logging
import os
import sys
from datetime import date, datetime, timezone

import requests
from apscheduler.schedulers.background import BackgroundScheduler
from bs4 import BeautifulSoup
from langchain_community.docstore.document import Document
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.utils import config
|
| 16 |
+
|
| 17 |
+
# Make the package root importable when this module is executed directly
# (e.g. by a task runner outside the app package) rather than imported.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)

# Root-logger configuration; all logging.* calls below go through it.
# NOTE(review): basicConfig at import time may clash with the host app's
# logging setup — confirm this module is the process entry point.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO
)

# Shared HTTP session so repeated outbound requests reuse connections.
session = requests.Session()
|
| 27 |
+
|
| 28 |
+
def fetch_weather_now():
    """Fetch the current weather for every state in ``config.STATES``.

    Returns:
        list[Document]: one Document per state that responded successfully,
        tagged with source/location/timestamp metadata. A state whose
        request fails is logged and skipped, so the list may be partial.
    """
    docs = []
    # Loop-invariant endpoint, hoisted. HTTPS is required here: the API key
    # travels as a query parameter and must not cross the wire in cleartext
    # (the original used http://).
    url = "https://api.weatherapi.com/v1/current.json"
    for state in config.STATES:
        try:
            params = {
                "key": config.WEATHER_API_KEY,
                "q": f"{state}, Nigeria",
                "aqi": "no",
            }
            res = session.get(url, params=params, timeout=10)
            res.raise_for_status()
            data = res.json()

            if "current" in data:
                current = data['current']
                text = (
                    f"Weather in {state}: {current['condition']['text']}, "
                    f"Temperature: {current['temp_c']}°C, "
                    f"Humidity: {current['humidity']}%"
                )
                docs.append(Document(
                    page_content=text,
                    metadata={
                        "source": "WeatherAPI",
                        "location": state,
                        # Timezone-aware UTC stamp; datetime.utcnow() is
                        # deprecated since Python 3.12 and returns naive times.
                        "timestamp": datetime.now(timezone.utc).isoformat()
                    }
                ))
        except Exception as e:
            # Best-effort sweep: one failing state must not abort the rest.
            logging.error(f"Weather fetch failed for {state}: {e}")
    return docs
|
| 62 |
+
|
| 63 |
+
def fetch_harvestplus_articles():
    """Scrape article texts from the configured HarvestPlus page.

    Returns:
        list[Document]: one Document per ``<article>`` element whose text
        exceeds 100 characters; [] on any fetch or parse failure.

    NOTE(review): the original contained a vacuous date filter
    (``if today_str in a.text or True``) that always passed, so no date
    filtering ever happened despite the old "today's articles" docstring.
    The dead check is removed here; behavior is unchanged (all long
    articles are returned). TODO: implement real publish-date filtering
    if only same-day articles are actually wanted.
    """
    try:
        res = session.get(config.DATA_SOURCES["harvestplus"], timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        docs = []
        for article in soup.find_all("article"):
            content = article.get_text(strip=True)
            # 100 chars is a heuristic floor to drop nav/teaser fragments.
            if content and len(content) > 100:
                docs.append(Document(
                    page_content=content,
                    metadata={
                        "source": "HarvestPlus",
                        # Timezone-aware UTC; datetime.utcnow() is deprecated
                        # since Python 3.12.
                        "timestamp": datetime.now(timezone.utc).isoformat()
                    }
                ))
        return docs
    except Exception as e:
        # Best-effort: a scrape failure degrades to "no news docs".
        logging.error(f"HarvestPlus fetch failed: {e}")
        return []
|
| 90 |
+
|
| 91 |
+
def build_rag_vectorstore(reset=False):
    """Refresh the live FAISS vectorstore from freshly fetched documents.

    Args:
        reset: when True, delete the on-disk index files and rebuild from
            scratch; when False, load the existing index (if any) and
            append the new chunks to it.
    """
    logging.info(f"RAG update started — {'FULL REBUILD' if reset else 'INCREMENTAL UPDATE'}")

    all_docs = fetch_weather_now() + fetch_harvestplus_articles()

    weather_total = sum(1 for d in all_docs if d.metadata['source'] == 'WeatherAPI')
    news_total = sum(1 for d in all_docs if d.metadata['source'] == 'HarvestPlus')
    logging.info(f"Weather docs fetched: {weather_total}")
    logging.info(f"News docs fetched: {news_total}")

    if not all_docs:
        logging.warning("No documents fetched, skipping update")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    chunked = splitter.split_documents(all_docs)

    embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)
    store_path = config.LIVE_VS_PATH

    # Full rebuild: clear out the previous index files before re-indexing.
    if reset and os.path.exists(store_path):
        for entry in os.listdir(store_path):
            target = os.path.join(store_path, entry)
            try:
                os.remove(target)
            except Exception as e:
                logging.error(f"Failed to delete {target}: {e}")
            else:
                logging.info(f"Deleted old file: {target}")

    if not reset and os.path.exists(store_path):
        # Incremental: extend the index that is already on disk.
        store = FAISS.load_local(
            store_path,
            embedder,
            allow_dangerous_deserialization=True
        )
        store.add_documents(chunked)
    else:
        # Fresh index (first run, or just-wiped full rebuild).
        store = FAISS.from_documents(chunked, embedder)

    os.makedirs(store_path, exist_ok=True)
    store.save_local(store_path)

    logging.info(f"Vectorstore updated at {store_path}")
|
| 134 |
+
|
| 135 |
+
def schedule_updates():
    """Start the background scheduler driving the two RAG refresh jobs.

    Registers an incremental update every 12 hours and a full rebuild
    every 7 days, starts the scheduler, and returns it so the caller can
    shut it down cleanly.
    """
    scheduler = BackgroundScheduler()
    # (interval kwargs, reset flag) pairs for the two recurring jobs.
    jobs = (
        ({"hours": 12}, False),  # incremental update
        ({"days": 7}, True),     # weekly full rebuild
    )
    for interval, do_reset in jobs:
        scheduler.add_job(build_rag_vectorstore, 'interval',
                          kwargs={"reset": do_reset}, **interval)
    scheduler.start()
    logging.info("Scheduler started — 12-hour incremental updates + weekly full rebuild")
    return scheduler
|