# rag_server / app.py -- Hugging Face Space artifact
# Last change: "Update app.py" by atkiya110 (commit 34ca45d, verified)
import os
import json
import asyncio
import numpy as np
import uvicorn
import httpx
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel

# Optional dependencies: each capability degrades gracefully when the
# package is missing; the *_OK flags are checked before use elsewhere.
try:
    import faiss  # dense vector index
    FAISS_OK = True
except ImportError:
    FAISS_OK = False
try:
    from sentence_transformers import SentenceTransformer  # query/doc embedder
    ST_OK = True
except ImportError:
    ST_OK = False
try:
    from rank_bm25 import BM25Okapi  # sparse lexical ranking
    BM25_OK = True
except ImportError:
    BM25_OK = False
try:
    from transformers import pipeline as hf_pipeline  # local LLM generation
    HF_OK = True
except ImportError:
    HF_OK = False
    print("[WARN] transformers not installed β€” generation disabled.")
# ─────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────
# Backend REST API (render.com free tier β€” may be asleep; see _wake_api_server).
API_BASE = "https://ewu-server.onrender.com/api"
# SECURITY FIX: the API key can now be supplied via the EWU_API_KEY
# environment variable. The hard-coded fallback keeps existing deployments
# working, but the literal should be rotated and removed from source control.
API_KEY = os.environ.get(
    "EWU_API_KEY", "i6EDytaX4E2jI6GvZQc0b1RSZHTI5_wVRa2rfL7rLpk"
)
API_HEADERS = {"x-api-key": API_KEY}

# Raw JSON knowledge files mirrored on GitHub.
GITHUB_BASE = "https://raw.githubusercontent.com/Atkiya/jsonfiles/main/"

# Small embedder + 1.1B chat model so the Space can run on CPU.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Character-based chunking parameters (consumed by chunk_documents).
CHUNK_SIZE = 400
CHUNK_OVERLAP = 80

# Use CUDA when torch is importable and a GPU is visible; otherwise CPU.
DEVICE = "cpu"
try:
    import torch
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
    pass

# Collection endpoints fetched wholesale at boot.
API_LIST_ENDPOINTS = [
    "admission-deadlines", "academic-calendar", "grade-scale",
    "tuition-fees", "scholarships", "clubs", "notices", "partnerships",
    "governance", "alumni", "helpdesk", "policies", "proctor-schedule",
    "documents", "newsletters", "programs", "faculty", "departments",
]
# For these collections, each item's detail page is also fetched,
# addressed by the given id_field of the list item.
API_DETAIL_ENDPOINTS = [
    {"list": "programs", "id_field": "id"},
    {"list": "faculty", "id_field": "id"},
    {"list": "documents", "id_field": "slug"},
]
GITHUB_FILES = [
    "admission_deadlines.json", "dynamic_admission_process.json",
    "dynamic_admission_requirements.json", "dynamic_tution_fees.json",
    "dynamic_events_workshops.json", "ewu_faculty_complete.json",
    "dynamic_grading.json", "dynamic_facilites.json",
    "ewu_proctor_schedule.json", "ewu_newsletters_complete.json",
    "static_aboutEWU.json", "static_Admin.json",
    "static_AllAvailablePrograms.json", "static_alumni.json",
    "static_campus_life.json", "static_Career_Counseling_Center.json",
    "static_clubs.json", "static_depts.json", "static_facilities.json",
    "static_helpdesk.json", "static_payment_procedure.json",
    "static_Policy.json", "static_Programs.json", "static_Rules.json",
    "static_Sexual_harassment.json", "static_Tuition_fees.json",
    "ma_english.json", "mba_emba.json", "ms_cse.json", "ms_dsa.json",
    "mds.json", "mphil_pharmacy.json", "mss_eco.json",
    "scholarships_and_financial_aids.json",
    "st_ba.json", "st_ce.json", "st_cse.json", "st_ece.json",
    "st_economics.json", "st_eee.json", "st_english.json", "st_geb.json",
    "st_information_studies.json", "st_law.json", "st_math.json",
    "st_pharmacy.json", "st_social_relations.json", "st_sociology.json",
    "syndicate.json", "tesol.json", "ewu_board_of_trustees.json",
]
# ─────────────────────────────────────────────
# APP STATE
# ─────────────────────────────────────────────
class AppState:
    """Mutable singleton holding models, corpus and search indexes.

    All fields are populated by the background boot task; `ready` flips
    to True only after indexes are built, and `error` records a boot
    failure message for /health and /rag to report.
    """

    def __init__(self):
        self.embedder = None        # SentenceTransformer instance (or None)
        self.generator = None       # TinyLlama text-generation pipeline
        self.documents: list = []   # chunked docs: {"content", "source"}
        self.faiss_index = None     # dense index over documents
        self.bm25 = None            # sparse index over documents
        self.ready: bool = False    # True once boot completed successfully
        self.error: str = ""        # boot failure message, if any


state = AppState()
# ─────────────────────────────────────────────
# DATA LOADING
# ─────────────────────────────────────────────
async def fetch_json(url: str, headers: dict = None, timeout: int = 60):
    """HTTP GET *url* and return the parsed JSON body.

    Returns None on any failure (non-200 status, network error, bad JSON),
    logging a one-line warning instead of raising.
    """
    request_headers = {} if headers is None else headers
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url, headers=request_headers)
            if response.status_code == 200:
                return response.json()
        print(f"[WARN] {url} β†’ HTTP {response.status_code}")
    except Exception as exc:
        print(f"[WARN] {url} β†’ {exc}")
    return None
def _unwrap(data) -> list:
if isinstance(data, list):
return data
if isinstance(data, dict):
for key in ("data", "results", "items"):
if key in data and isinstance(data[key], list):
return data[key]
return [data]
return []
async def _wake_api_server():
    """Ping the backend until it responds, returning True once awake.

    render.com free tier sleeps after inactivity; a cheap endpoint is
    polled up to 3 times (10 s apart) before giving up with False.
    """
    print(" [API] Waking render.com server (free tier may be sleeping)…")
    attempt = 0
    while attempt < 3:
        probe = await fetch_json(f"{API_BASE}/grade-scale", API_HEADERS, timeout=60)
        if probe is not None:
            print(" [API] Server awake.")
            return True
        print(f" [API] Wake attempt {attempt+1}/3 failed, retrying…")
        await asyncio.sleep(10)
        attempt += 1
    print(" [API] Server did not wake β€” skipping API data.")
    return False
async def load_api() -> list:
    """Fetch all list endpoints (plus selected per-item detail pages) from
    the EWU REST API and return raw docs as {"content": json_str, "source": tag}.

    Returns [] when the render.com server cannot be woken.
    """
    awake = await _wake_api_server()
    if not awake:
        return []
    # Fetch every list endpoint concurrently; exceptions stay in-place so
    # one failing endpoint does not abort the rest.
    list_results = await asyncio.gather(
        *[fetch_json(f"{API_BASE}/{ep}", API_HEADERS) for ep in API_LIST_ENDPOINTS],
        return_exceptions=True,
    )
    docs, list_cache = [], {}
    for ep, data in zip(API_LIST_ENDPOINTS, list_results):
        if not data or isinstance(data, Exception):
            continue
        items = _unwrap(data)
        list_cache[ep] = items  # kept for the detail-page pass below
        for item in items:
            text = json.dumps(item, ensure_ascii=False)
            if text.strip():
                docs.append({"content": text, "source": f"api:{ep}"})
    print(f" [API lists] {len(docs)} docs")
    # Build (url, source-tag) pairs for the per-item detail pages declared
    # in API_DETAIL_ENDPOINTS, keyed by each item's id_field.
    detail_tasks = []
    for cfg in API_DETAIL_ENDPOINTS:
        for item in list_cache.get(cfg["list"], []):
            item_id = item.get(cfg["id_field"]) if isinstance(item, dict) else None
            if item_id is not None:
                url = f"{API_BASE}/{cfg['list']}/{item_id}"
                detail_tasks.append((url, f"api:{cfg['list']}/{item_id}"))
    if detail_tasks:
        detail_results = await asyncio.gather(
            *[fetch_json(url, API_HEADERS) for url, _ in detail_tasks],
            return_exceptions=True,
        )
        n = 0
        for (_, source), data in zip(detail_tasks, detail_results):
            if not data or isinstance(data, Exception):
                continue
            for item in _unwrap(data):
                text = json.dumps(item, ensure_ascii=False)
                if text.strip():
                    docs.append({"content": text, "source": source})
                    n += 1
        print(f" [API details] {n} docs from {len(detail_tasks)} pages")
    print(f" [API total] {len(docs)} raw docs")
    return docs
async def load_github() -> list:
    """Download every file in GITHUB_FILES concurrently and return raw docs
    as {"content": json_str, "source": "github:<filename>"}.

    Failed or empty downloads are skipped silently (fetch_json already warns).
    """
    responses = await asyncio.gather(
        *[fetch_json(GITHUB_BASE + f) for f in GITHUB_FILES],
        return_exceptions=True,
    )
    docs = []
    for fname, payload in zip(GITHUB_FILES, responses):
        if not payload or isinstance(payload, Exception):
            continue
        records = payload if isinstance(payload, list) else [payload]
        for record in records:
            text = json.dumps(record, ensure_ascii=False)
            if text.strip():
                docs.append({"content": text, "source": f"github:{fname}"})
    print(f" [GitHub] {len(docs)} raw docs")
    return docs
# ─────────────────────────────────────────────
# CHUNKING
# ─────────────────────────────────────────────
def chunk_documents(docs, size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split oversized docs into overlapping character windows.

    Docs whose content fits in *size* pass through untouched; longer ones
    are sliced into windows of *size* chars advancing by size - overlap.
    Blank docs and blank slices are dropped.
    """
    stride = max(1, size - overlap)
    chunks = []
    for doc in docs:
        body = doc["content"]
        if not body.strip():
            continue
        if len(body) <= size:
            chunks.append(doc)
            continue
        for pos in range(0, len(body), stride):
            window = body[pos:pos + size]
            if window.strip():
                chunks.append({"content": window, "source": doc["source"]})
    return chunks
# ─────────────────────────────────────────────
# INDEX BUILDING
# ─────────────────────────────────────────────
def build_indexes():
    """Build the FAISS dense index and BM25 sparse index over state.documents.

    Each index is best-effort: a failure logs an error and leaves that
    index as None rather than aborting. Returns False only when there are
    no documents at all, True otherwise.
    """
    if not state.documents:
        print("[WARN] No documents to index.")
        return False
    corpus = [doc["content"] for doc in state.documents]
    if FAISS_OK and ST_OK and state.embedder:
        try:
            vectors = np.array(
                state.embedder.encode(
                    corpus, normalize_embeddings=True,
                    show_progress_bar=False, batch_size=64,
                ),
                dtype="float32",
            )
            if vectors.ndim == 2 and vectors.shape[0] > 0:
                # Inner product on normalized embeddings == cosine similarity.
                state.faiss_index = faiss.IndexFlatIP(vectors.shape[1])
                state.faiss_index.add(vectors)
                print(f" [FAISS] {state.faiss_index.ntotal} vectors (dim={vectors.shape[1]})")
        except Exception as exc:
            print(f"[ERROR] FAISS: {exc}")
            state.faiss_index = None
    if BM25_OK:
        try:
            tokenized = [text.lower().split() for text in corpus if text.strip()]
            if tokenized:
                state.bm25 = BM25Okapi(tokenized)
                print(f" [BM25] {len(tokenized)} docs")
        except Exception as exc:
            print(f"[ERROR] BM25: {exc}")
            state.bm25 = None
    return True
# ─────────────────────────────────────────────
# RETRIEVAL
# ─────────────────────────────────────────────
def search_dense(query, k=8):
    """Return up to *k* chunks nearest to *query* by embedding similarity.

    Each hit is the stored doc dict plus a float "score". Returns [] when
    the dense index or embedder is unavailable, or on any error.
    """
    if not state.faiss_index or not state.embedder:
        return []
    try:
        query_vec = np.array(
            state.embedder.encode([query], normalize_embeddings=True),
            dtype="float32",
        )
        top = min(k, state.faiss_index.ntotal)
        if not top:
            return []
        scores, ids = state.faiss_index.search(query_vec, top)
        hits = []
        for score, doc_id in zip(scores[0], ids[0]):
            if doc_id >= 0:  # FAISS pads missing results with -1
                hits.append({**state.documents[doc_id], "score": float(score)})
        return hits
    except Exception as exc:
        print(f"[ERROR] dense: {exc}")
        return []
def search_sparse(query, k=8):
    """Return up to *k* chunks ranked by BM25 lexical overlap with *query*.

    Hits carry a float "score"; zero-score matches are dropped. Returns []
    when BM25 is unavailable, the query is blank, or on any error.
    """
    if not state.bm25 or not state.documents:
        return []
    try:
        query_tokens = query.lower().split()
        if not query_tokens:
            return []
        raw_scores = np.array(state.bm25.get_scores(query_tokens), dtype="float32")
        ranked = np.argsort(raw_scores)[::-1][: min(k, len(raw_scores))]
        return [
            {**state.documents[pos], "score": float(raw_scores[pos])}
            for pos in ranked
            if raw_scores[pos] > 0
        ]
    except Exception as exc:
        print(f"[ERROR] sparse: {exc}")
        return []
def hybrid_search(query, k=5, alpha=0.65):
    """Fuse dense and sparse rankings via weighted reciprocal-rank fusion.

    Each retriever contributes weight / (60 + rank + 1) per hit, with the
    dense side weighted *alpha* and the sparse side 1 - alpha. Hits are
    deduplicated by chunk content; the top *k* fused results are returned
    with a rounded "rrf_score".
    """
    dense_hits = search_dense(query, k * 3)
    sparse_hits = search_sparse(query, k * 3)
    if not dense_hits and not sparse_hits:
        return []
    rrf_base = 60
    fused, by_content = {}, {}
    # Process dense first, then sparse, so a sparse duplicate's doc dict
    # wins in by_content (matches the original accumulation order).
    for weight, ranking in ((alpha, dense_hits), (1 - alpha, sparse_hits)):
        for rank, hit in enumerate(ranking):
            key = hit["content"]
            fused[key] = fused.get(key, 0.0) + weight / (rrf_base + rank + 1)
            by_content[key] = hit
    best = sorted(fused.items(), key=lambda kv: kv[1], reverse=True)[:k]
    return [
        {**by_content[content], "rrf_score": round(score, 6)}
        for content, score in best
    ]
async def async_hybrid_search(query, k=5):
    """Run the CPU-bound hybrid_search in a worker thread so the event loop stays responsive."""
    return await asyncio.to_thread(hybrid_search, query, k)
# ─────────────────────────────────────────────
# GENERATION β€” TinyLlama (local, no API key)
# ─────────────────────────────────────────────
# System instruction prepended to every TinyLlama chat; constrains answers
# to the retrieved context to reduce hallucination.
SYSTEM_PROMPT = (
    "You are a helpful assistant for East West University (EWU). "
    "Answer using ONLY the context provided. "
    "If the context does not contain enough information, say so honestly. "
    "Be concise and accurate. Do not repeat the context."
)
def _run_tinyllama(query: str, context: str) -> str:
    """Blocking TinyLlama chat-completion call.

    Builds the system+user message pair TinyLlama-1.1B-Chat was trained on,
    trims the context so the prompt fits in the 2048-token window, and
    extracts the final assistant turn from the pipeline output. Always
    invoke via asyncio.to_thread - never directly from async code.
    """
    if state.generator is None:
        return f"[Generator not loaded]\n\nContext:\n{context}"
    # Keep ~1500 chars of context; mark truncation with an ellipsis.
    clipped = context[:1500]
    if len(context) > 1500:
        clipped += "…"
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Context:\n{clipped}\n\nQuestion: {query}"},
    ]
    try:
        output = state.generator(
            chat,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.3,  # low = more factual, less hallucination
            top_p=0.9,
            repetition_penalty=1.1,
        )
        reply = output[0]["generated_text"]
        # Chat pipelines return the whole conversation as a message list;
        # take the last assistant turn when present.
        if isinstance(reply, list):
            for turn in reversed(reply):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    return turn.get("content", "").strip()
        # Fallback: stringify whatever came back.
        return str(reply).strip()
    except Exception as exc:
        print(f"[ERROR] TinyLlama inference: {exc}")
        return f"[Generation error: {exc}]"
async def generate(query: str, context: str) -> str:
    """Async wrapper β€” runs the blocking TinyLlama call in a worker thread
    so the event loop stays free while the model generates."""
    return await asyncio.to_thread(_run_tinyllama, query, context)
# ─────────────────────────────────────────────
# BOOT
# ─────────────────────────────────────────────
def _load_generator():
    """Build the TinyLlama text-generation pipeline.

    Blocking; intended to run in a thread during boot. Returns the pipeline,
    or None when transformers is unavailable or loading fails.
    """
    if not HF_OK:
        print("[WARN] transformers unavailable β€” generation disabled.")
        return None
    try:
        print(f" Loading TinyLlama on {DEVICE}…")
        device_index = 0 if DEVICE == "cuda" else -1  # -1 = CPU for transformers pipeline
        pipe = hf_pipeline(
            "text-generation",
            model=GEN_MODEL,
            device=device_index,
            dtype="auto",
        )
        print(" TinyLlama ready.")
        return pipe
    except Exception as exc:
        print(f"[ERROR] Could not load TinyLlama: {exc}")
        return None
async def _boot():
    """Background startup: load models, fetch the corpus, chunk, and index.

    Runs as a fire-and-forget task from the lifespan handler so the HTTP
    server (and /health) is responsive during initialization. Sets
    state.ready on success; records the failure message in state.error.
    """
    try:
        # 1. Load both models concurrently in threads
        print(f"Loading models on {DEVICE}…")
        state.embedder, state.generator = await asyncio.gather(
            asyncio.to_thread(SentenceTransformer, EMBED_MODEL, device=DEVICE) if ST_OK
            else asyncio.to_thread(lambda: None),  # placeholder when sentence-transformers is absent
            asyncio.to_thread(_load_generator),
        )
        if state.embedder:
            print(" Embedder ready.")
        # 2. Fetch API + GitHub concurrently
        print("Fetching knowledge base (API + GitHub)…")
        api_docs, gh_docs = await asyncio.gather(
            load_api(), load_github(), return_exceptions=False,
        )
        raw_docs = api_docs + gh_docs
        print(f" Combined raw docs: {len(raw_docs)}")
        if not raw_docs:
            print("[WARN] No documents fetched.")
        # 3. Chunk
        state.documents = await asyncio.to_thread(chunk_documents, raw_docs)
        print(f" Total chunks: {len(state.documents)}")
        # 4. Build indexes
        print("Building indexes…")
        await asyncio.to_thread(build_indexes)
        state.ready = True
        print("βœ“ RAG server ready.")
    except Exception as e:
        # Record the failure so /health and /rag can surface it.
        state.error = str(e)
        state.ready = False
        print(f"[ERROR] Boot failed: {e}")
        import traceback; traceback.print_exc()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: start boot in the background, cancel it on shutdown."""
    background = asyncio.create_task(_boot())
    try:
        yield
    finally:
        # Stop a still-running boot cleanly when the server shuts down.
        background.cancel()
        try:
            await background
        except asyncio.CancelledError:
            pass
# ─────────────────────────────────────────────
# APP + ENDPOINTS
# ─────────────────────────────────────────────
app = FastAPI(title="EWU RAG Server", lifespan=lifespan)


class Query(BaseModel):
    """Request body for POST /rag."""
    query: str      # user question; must be non-empty
    top_k: int = 5  # number of chunks to retrieve and cite
@app.post("/rag")
async def rag_endpoint(q: Query):
if not state.ready:
raise HTTPException(503, detail=state.error or "Still initializing β€” retry shortly.")
if not q.query.strip():
raise HTTPException(400, detail="Query must not be empty.")
results = await async_hybrid_search(q.query, k=q.top_k)
if not results:
return {"answer": "No relevant information found.", "sources": []}
context = "\n\n---\n\n".join(r["content"] for r in results)
answer = await generate(q.query, context)
return {
"answer": answer,
"sources": [{"source": r.get("source"), "rrf_score": r.get("rrf_score", 0)}
for r in results],
}
@app.get("/health")
async def health():
return JSONResponse(200, {
"status" : "ready" if state.ready else ("error" if state.error else "loading"),
"docs" : len(state.documents),
"device" : DEVICE,
"faiss" : state.faiss_index is not None,
"bm25" : state.bm25 is not None,
"generator" : state.generator is not None,
"error" : state.error or None,
})
if __name__ == "__main__":
    # Hugging Face Spaces expects the app to listen on port 7860.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)