Spaces:
Sleeping
Sleeping
| import os | |
| import shutil | |
| from pathlib import Path | |
| from fastapi import FastAPI, Form | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.responses import JSONResponse, FileResponse | |
| from dotenv import load_dotenv | |
| from huggingface_hub import HfApi, hf_hub_download | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from llama_api import ask_ollama | |
# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Paths and identifiers used throughout the app.
BASE_DIR = Path(__file__).resolve().parent
PDF_PATH = BASE_DIR / "src/data"            # local cache of downloaded PDFs
DB_DIR = BASE_DIR / "chroma_digital_icfai"  # persisted Chroma vector store
HF_DATASET = "Chaitu2112/ifhe-assets"       # HF dataset repo holding the PDFs
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Ensure the PDF directory exists before any download is attempted.
PDF_PATH.mkdir(parents=True, exist_ok=True)
# -----------------------------------------------------------
# DOWNLOAD PDFs FROM HUGGINGFACE
# -----------------------------------------------------------
def download_pdfs(repo_id: str, local_dir: Path) -> None:
    """Mirror every PDF from a Hugging Face dataset repo into *local_dir*.

    Files that already exist locally (and are non-empty) are skipped, so
    repeated startups do not re-download. All failures are logged and
    swallowed: the app should still boot with whatever PDFs are available.

    Args:
        repo_id: Hugging Face dataset repo id (e.g. "user/name").
        local_dir: Existing local directory to copy the PDFs into.
    """
    api = HfApi()
    try:
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    except Exception as e:
        # Network/auth problems: warn and continue with local files only.
        print(f"β οΈ Cannot list dataset files: {e}")
        return

    pdfs = [f for f in files if f.lower().endswith(".pdf")]
    if not pdfs:
        print("β οΈ No PDFs found in dataset.")
        return

    for f in pdfs:
        # Flatten any repo sub-directories: keep only the file name locally.
        local_path = local_dir / Path(f).name
        if local_path.exists() and local_path.stat().st_size > 0:
            continue  # already cached and non-empty
        try:
            print(f"π₯ Downloading: {f}")
            cached = hf_hub_download(repo_id=repo_id, filename=f, repo_type="dataset")
            shutil.copy(cached, local_path)
            print(f"β Saved to {local_path}")
        except Exception as e:
            print(f"β οΈ Download failed for {f}: {e}")


download_pdfs(HF_DATASET, PDF_PATH)
# -----------------------------------------------------------
# BUILD CHROMA VECTOR DB
# -----------------------------------------------------------
def build_chroma_db() -> Chroma:
    """Build and persist a Chroma vector DB from every PDF in PDF_PATH.

    Returns a ready-to-query ``Chroma`` instance. If no PDFs are found,
    a placeholder DB containing a single dummy document is created so
    the rest of the app can still start.
    """
    print("πΉ Building vector DB from PDFs.")

    # One embeddings instance serves both the empty and the normal path
    # (the original constructed the model twice).
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)

    # Sorted iteration makes chunk ordering deterministic across runs
    # (os.listdir order is arbitrary).
    documents = []
    for path in sorted(p for p in PDF_PATH.iterdir() if p.name.lower().endswith(".pdf")):
        print(f" Loading PDF: {path}")
        loader = PyMuPDFLoader(str(path))
        documents.extend(loader.load())

    if not documents:
        print(" No PDFs found. Creating empty DB.")
        return Chroma.from_texts(
            texts=["No content available"],
            embedding=embeddings,
            persist_directory=str(DB_DIR),
        )

    # Overlapping chunks preserve sentence context across boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        separators=["\n\n", "\n", ".", "!", "?", " "],
    )
    docs = splitter.split_documents(documents)
    print(f"πΉ Total chunks after splitting: {len(docs)}")

    vectordb = Chroma.from_documents(
        docs,
        embedding=embeddings,
        persist_directory=str(DB_DIR),
    )
    print(" Vector DB built and persisted.")
    return vectordb
# -----------------------------------------------------------
# LOAD OR CREATE CHROMA
# -----------------------------------------------------------
def load_or_create_chroma() -> Chroma:
    """Return the persisted Chroma DB if one exists, otherwise build it."""
    db_present = DB_DIR.exists() and any(DB_DIR.iterdir())
    if not db_present:
        print("πΉ No existing DB found; building a new one...")
        return build_chroma_db()

    print("πΉ Loading existing Chroma DB.")
    return Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME),
        persist_directory=str(DB_DIR),
    )


vectordb = load_or_create_chroma()
retriever = vectordb.as_retriever(search_kwargs={"k": 6})
print("β Retriever is ready.")
# -----------------------------------------------------------
# FASTAPI SETUP
# -----------------------------------------------------------
app = FastAPI()

# Wide-open CORS: the API is consumed from arbitrary front-end hosts.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve the bundled front-end build (dist/) when it is present.
DIST_DIR = BASE_DIR / "dist"
if DIST_DIR.exists():
    assets_dir = DIST_DIR / "assets"
    if assets_dir.exists():
        app.mount("/assets", StaticFiles(directory=str(assets_dir)), name="assets")

    # NOTE(review): serve_index had no route decorator, so it was never
    # registered with FastAPI. "/" is the conventional SPA entry point —
    # confirm this matches the intended route.
    @app.get("/")
    async def serve_index():
        return FileResponse(DIST_DIR / "index.html")
else:
    print(" dist/ folder not found β frontend not served by backend.")
# -----------------------------------------------------------
# MAIN CHAT API
# -----------------------------------------------------------
# NOTE(review): this handler had no route decorator, so it was never
# registered; the log line below names the intended path — confirm.
@app.post("/digital_icfai_chat")
async def digital_icfai_chat_post(user_message: str = Form(...)):
    """Answer a student question via RAG over the ingested PDFs.

    Retrieves the top-k relevant chunks, stuffs them into a prompt, and
    asks the local LLM. Errors are reported in the JSON body (never
    raised) so the front-end always receives an answer payload.
    """
    query = user_message.strip()
    print(f" /digital_icfai_chat POST query: {query!r}")

    # -------------------------------------------------------
    # ROBUST RETRIEVER LOGIC: tolerate several retriever API
    # generations (sync, async, and direct vector-store access).
    # -------------------------------------------------------
    try:
        docs = None
        # Preferred method
        if hasattr(retriever, "get_relevant_documents"):
            maybe = retriever.get_relevant_documents(query)
            docs = await maybe if hasattr(maybe, "__await__") else maybe
        # Async alternatives
        elif hasattr(retriever, "aget_relevant_documents"):
            maybe = retriever.aget_relevant_documents(query)
            docs = await maybe if hasattr(maybe, "__await__") else maybe
        elif hasattr(retriever, "get_relevant_documents_async"):
            maybe = retriever.get_relevant_documents_async(query)
            docs = await maybe if hasattr(maybe, "__await__") else maybe
        # Fallback: use Chroma directly
        elif hasattr(vectordb, "similarity_search"):
            maybe = vectordb.similarity_search(query, k=6)
            docs = await maybe if hasattr(maybe, "__await__") else maybe
        else:
            msg = "Retriever does not support document search in this environment."
            print("β", msg)
            return {"answer": msg}
        print(f" Retrieved {len(docs) if docs else 0} docs")
    except Exception as e:
        print(f"β Retriever error: {e}")
        return {"answer": f"Retriever error: {e}"}

    context = "\n\n".join([d.page_content for d in docs]) if docs else ""
    print(f" Context length: {len(context)} chars")

    prompt = f"""
You are the Digital ICFAI Assistant.
Use ONLY the context below to answer.
If the context does not contain the answer, say so politely.
Give a detailed, clear, student-friendly explanation with 4β6 lines. Add examples wherever helpful.
Context:
{context}
Question:
{query}
Answer (clear, student-friendly):
"""

    try:
        answer = ask_ollama(prompt)
    except Exception as e:
        print(f"β LLM error: {e}")
        answer = f"LLM error: {e}"
    return {"answer": answer}
# -----------------------------------------------------------
# DEV SERVER
# -----------------------------------------------------------
if __name__ == "__main__":
    import uvicorn

    # Hosting platforms inject PORT; default to 7860 for local runs.
    serve_port = int(os.environ.get("PORT", 7860))
    print(f"π Starting IBS Assistant at http://0.0.0.0:{serve_port}")
    uvicorn.run("app:app", host="0.0.0.0", port=serve_port, reload=False)