Spaces:

hamaylza
/

bot

Running

App Files Files Community

bot / app.py

hamaylza

Update app.py

7ce2534 verified about 1 month ago

Raw

History Blame Contribute Delete

6.66 kB

	# =========================
	# MAYLBOT ULTRA - FASTAPI VERSION
	# =========================

	import os, time, sqlite3, hashlib, zipfile, threading
	from pathlib import Path

	import requests
	import chromadb
	from sentence_transformers import SentenceTransformer
	from groq import Groq

	from fastapi import FastAPI, UploadFile, File

	# =========================
	# CONFIG
	# =========================

	# Archive that setup_dataset() extracts when DATASET_DIR is not present yet.
	DATASET_ZIP = "maylbot_dataset.zip"
	# Directory that ingest() scans recursively for *.txt files.
	DATASET_DIR = Path("maylbot_dataset")

	# Path handed to the persistent Chroma client in get_collection().
	CHROMA_DIR = "chroma"
	# SQLite file opened by init_db(), save() and load() for the chat log.
	SQLITE_PATH = "memory.db"

	# Name of the Chroma collection that stores the chunk embeddings.
	COLLECTION_NAME = "maylbot"
	# Model name passed to SentenceTransformer in load_embed().
	EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
	# Model name sent with every Groq chat-completion request.
	GROQ_MODEL = "llama-3.3-70b-versatile"

	# GitHub account whose repository list github() fetches for the prompt.
	GITHUB_USER = "hamaylzahid"
	GITHUB_API = f"https://api.github.com/users/{GITHUB_USER}/repos"

	# Number of results requested from the collection per query.
	TOP_K = 5
	# NOTE: despite the name, retrieve() compares this against the returned
	# *distance* (documents with distance < SIM_THRESHOLD are kept).
	SIM_THRESHOLD = 0.85
	# Chunk length and overlap, both counted in whitespace-separated words.
	CHUNK_SIZE = 120
	CHUNK_OVERLAP = 30

	# =========================
	# APP INIT
	# =========================

	# FastAPI application instance; the route decorators further down register on it.
	app = FastAPI(title="MAYLBOT API")

	# =========================
	# DATASET
	# =========================

	def setup_dataset():
	    """Make sure the dataset directory is available.

	    Does nothing when DATASET_DIR already exists. Otherwise extracts
	    DATASET_ZIP into the current directory, or prints a notice when the
	    archive is absent as well.
	    """
	    if DATASET_DIR.exists():
	        return

	    archive_path = Path(DATASET_ZIP)
	    if not archive_path.exists():
	        print("Dataset missing")
	        return

	    with zipfile.ZipFile(DATASET_ZIP, "r") as archive:
	        archive.extractall(".")

	# =========================
	# CHUNKING
	# =========================

	def chunk_text(text, size=None, overlap=None):
	    """Split *text* into overlapping chunks of whitespace-separated words.

	    Args:
	        text: The text to split.
	        size: Words per chunk; defaults to CHUNK_SIZE.
	        overlap: Words shared between consecutive chunks; defaults to
	            CHUNK_OVERLAP.

	    Returns:
	        A list of chunk strings, empty when *text* contains no words.

	    Raises:
	        ValueError: If *size* is not positive or *overlap* is not in
	            ``[0, size)``; such values would otherwise never advance.
	    """
	    size = CHUNK_SIZE if size is None else size
	    overlap = CHUNK_OVERLAP if overlap is None else overlap
	    if size <= 0 or not 0 <= overlap < size:
	        raise ValueError("chunk size must be positive and larger than the overlap")

	    words = text.split()
	    step = size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(" ".join(words[start:start + size]))
	        # Once a window reaches the last word, any further window would be
	        # a pure suffix of this one, i.e. a redundant duplicate chunk.
	        if start + size >= len(words):
	            break
	    return chunks

	# =========================
	# EMBEDDINGS + VECTOR DB
	# =========================

	def load_embed():
	    """Build and return the SentenceTransformer named by EMBED_MODEL."""
	    embedder = SentenceTransformer(EMBED_MODEL)
	    return embedder

	def get_collection():
	    """Return the COLLECTION_NAME collection from the Chroma store at CHROMA_DIR.

	    The collection is created when it does not exist yet.
	    """
	    return chromadb.PersistentClient(path=CHROMA_DIR).get_or_create_collection(
	        name=COLLECTION_NAME
	    )

	def ingest(col, model):
	    """Embed every not-yet-stored chunk of the dataset into *col*.

	    Each ``*.txt`` file under DATASET_DIR is split with chunk_text(). A chunk
	    id is the MD5 of ``"<path>_<index>"``, so chunks whose id is already in
	    the collection are skipped.

	    Args:
	        col: Chroma collection to read existing ids from and add chunks to.
	        model: Embedding model exposing ``encode``.
	    """
	    batch_size = 256  # keeps each encode/add call bounded for large files

	    # ids are always returned; include=[] avoids loading every stored
	    # document and its metadata just to learn which ids exist.
	    existing = set(col.get(include=[])["ids"])

	    for path in DATASET_DIR.rglob("*.txt"):
	        # Explicit encoding so decoding does not depend on the host locale.
	        text = path.read_text(encoding="utf-8", errors="ignore")

	        new_ids, new_docs = [], []
	        for index, chunk in enumerate(chunk_text(text)):
	            chunk_id = hashlib.md5(f"{path}_{index}".encode()).hexdigest()
	            if chunk_id in existing:
	                continue
	            new_ids.append(chunk_id)
	            new_docs.append(chunk)

	        # Encode and insert in batches instead of one round trip per chunk.
	        for lo in range(0, len(new_docs), batch_size):
	            docs = new_docs[lo:lo + batch_size]
	            ids = new_ids[lo:lo + batch_size]
	            embeddings = model.encode(docs).tolist()
	            col.add(documents=docs, embeddings=embeddings, ids=ids)

	        existing.update(new_ids)

	# =========================
	# RETRIEVAL
	# =========================

	def retrieve(q, col, model):
	    """Return the stored documents closest to the query *q*.

	    Asks *col* for the TOP_K nearest documents and keeps those whose distance
	    is below SIM_THRESHOLD. When none qualifies, all returned documents are
	    handed back instead.
	    """
	    query_vector = model.encode(q).tolist()
	    result = col.query(
	        query_embeddings=[query_vector],
	        n_results=TOP_K,
	        include=["documents", "distances"],
	    )

	    documents = result["documents"][0]
	    distances = result["distances"][0]

	    close_enough = []
	    for document, distance in zip(documents, distances):
	        if distance < SIM_THRESHOLD:
	            close_enough.append(document)

	    if close_enough:
	        return close_enough
	    return documents

	# =========================
	# MEMORY (SQLite)
	# =========================

	def init_db():
	    """Create the ``chat(role, content)`` table in SQLITE_PATH if it is missing."""
	    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
	    try:
	        # The connection context manager commits on success and rolls back
	        # on error; it does not close, hence the surrounding try/finally.
	        with conn:
	            conn.execute("CREATE TABLE IF NOT EXISTS chat(role TEXT, content TEXT)")
	    finally:
	        conn.close()

	def save(role, msg):
	    """Append one chat message to the ``chat`` table.

	    Args:
	        role: Speaker label stored in the ``role`` column.
	        msg: Message text stored in the ``content`` column.
	    """
	    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
	    try:
	        # Commit on success, roll back on failure; always close afterwards
	        # so a failed insert does not leak the connection.
	        with conn:
	            conn.execute("INSERT INTO chat VALUES (?,?)", (role, msg))
	    finally:
	        conn.close()

	def load():
	    """Return the whole chat log as ``{"role", "content"}`` dicts, oldest first."""
	    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
	    try:
	        # Without ORDER BY the row order is unspecified; callers slice the
	        # tail of this list, so insertion order must be made explicit.
	        rows = conn.execute(
	            "SELECT role, content FROM chat ORDER BY rowid"
	        ).fetchall()
	    finally:
	        conn.close()
	    return [{"role": r, "content": c} for r, c in rows]

	# =========================
	# GROQ CLIENT
	# =========================

	def get_groq():
	    """Return a Groq client configured from the GROQ_API_KEY environment variable.

	    Raises:
	        ValueError: If the variable is unset or empty.
	    """
	    api_key = os.environ.get("GROQ_API_KEY")
	    if api_key:
	        return Groq(api_key=api_key)
	    raise ValueError("Missing GROQ_API_KEY")

	# =========================
	# GITHUB CACHE
	# =========================

	# Last successful GitHub summary and the time.time() at which it was stored.
	cache = {"data": None, "time": 0}
	# Guards reads and writes of `cache`.
	lock = threading.Lock()

	def github():
	    """Return a short "name - language" listing of up to ten GitHub repos.

	    A result younger than 300 seconds is served from `cache`. On a failed
	    refresh the previous listing is returned if there is one, otherwise an
	    empty string.
	    """
	    with lock:
	        if time.time() - cache["time"] < 300:
	            return cache["data"]

	    # The lock is not held during the HTTP call, so a slow request never
	    # blocks callers that only need the cache.
	    try:
	        # Timeout: without it a stalled connection hangs the caller forever.
	        resp = requests.get(GITHUB_API, timeout=10)
	        # Error responses (e.g. rate limiting) carry a dict, not a repo list.
	        resp.raise_for_status()
	        repos = resp.json()
	        txt = "\n".join(
	            f"{repo['name']} - {repo['language']}" for repo in repos[:10]
	        )
	    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
	        print(f"GitHub fetch failed: {exc}")
	        with lock:
	            return cache["data"] or ""

	    with lock:
	        cache["data"] = txt
	        cache["time"] = time.time()
	    return txt

	# =========================
	# PROMPT
	# =========================

	def build_prompt(context, history, gh):
	    """Render the system prompt sent to the chat model.

	    Args:
	        context: Iterable of retrieved text snippets, joined with newlines.
	        history: Chat history list; only its last six entries are rendered.
	        gh: Text placed under the LIVE GITHUB heading.

	    Returns:
	        The full prompt string.
	    """
	    context_text = chr(10).join(context)
	    recent_history = history[-6:]
	    return f"""
	You are MAYLBOT — a high-end AI assistant built by an advanced AI engineer.

	PERSONALITY:
	- Confident, sharp, slightly witty
	- Speaks like a real engineer, not a chatbot
	- No robotic phrasing

	IDENTITY:
	- Hamayl Zahid is a female AI engineer
	- ALWAYS refer to her as she/her

	INTELLIGENCE:
	- Combine reasoning + memory + context
	- Fill small gaps logically
	- Never sound clueless

	When answering analytical questions (like hiring, rating, comparison):

	Return structure:

	1. Evidence Found:
	2. Missing Evidence:
	3. Reasoning:
	4. Final Verdict:
	5. Confidence Level (Low / Medium / High)

	ANALYSIS:
	When evaluating projects:
	- technical depth
	- real-world value
	- innovation

	LIVE GITHUB:
	{gh}

	CONTEXT:
	{context_text}

	MEMORY:
	{recent_history}

	RULES:
	- No "I don't know" unless zero signal
	- Be natural, not formal AI tone
	- Keep answers smart and clean
	"""


	# =========================
	# INIT MODELS
	# =========================

	# Start-up sequence, run once at import time. Order matters: the dataset
	# must be on disk, and the model and collection built, before ingestion.
	setup_dataset()
	model = load_embed()
	col = get_collection()
	client = get_groq()
	init_db()

	# Populate the vector store only when it holds no entries at all.
	if not col.count():
	    ingest(col, model)

	# =========================
	# CHAT ENGINE
	# =========================

	def run_chat(q, history):
	    """Answer *q* with the Groq chat model and record the exchange.

	    Builds a system prompt from retrieved context, *history* and the GitHub
	    summary, sends it with the last six history entries and the new user
	    message, then saves both the question and the reply.

	    Returns:
	        The reply text from the first completion choice.
	    """
	    retrieved = retrieve(q, col, model)
	    system_prompt = build_prompt(retrieved, history, github())

	    messages = [
	        {"role": "system", "content": system_prompt},
	        *history[-6:],
	        {"role": "user", "content": q},
	    ]

	    completion = client.chat.completions.create(
	        model=GROQ_MODEL,
	        messages=messages,
	        temperature=0.5,
	    )
	    answer = completion.choices[0].message.content

	    # Persist the turn only after the model has answered.
	    for role, text in (("user", q), ("assistant", answer)):
	        save(role, text)

	    return answer

	# =========================
	# API ENDPOINTS
	# =========================

	from fastapi.responses import FileResponse

	@app.get("/")
	def ui():
	    """Return index.html as a file response for the root URL."""
	    page = FileResponse("index.html")
	    return page

	@app.post("/chat")
	def chat_api(payload: dict):
	    """Answer the chat message carried in the JSON body.

	    Args:
	        payload: Request body; its ``"message"`` entry is the user's text.

	    Returns:
	        ``{"response": <reply text>}``.

	    Raises:
	        HTTPException: 400 when ``"message"`` is missing, not a string, or
	            blank. Previously such input reached the embedding model and
	            surfaced as a 500.
	    """
	    # Local import keeps this block self-contained.
	    from fastapi import HTTPException

	    q = payload.get("message")
	    if not isinstance(q, str) or not q.strip():
	        raise HTTPException(
	            status_code=400,
	            detail="'message' must be a non-empty string",
	        )

	    history = load()
	    response = run_chat(q, history)

	    return {"response": response}

	@app.post("/voice")
	async def voice_api(file: UploadFile = File(...)):
	    """Transcribe an uploaded audio file and answer the transcript.

	    Args:
	        file: Uploaded audio; its bytes are sent to the transcription model.

	    Returns:
	        ``{"transcript": <transcription result>, "response": <reply text>}``.

	    Raises:
	        HTTPException: 400 when the upload contains no bytes.
	    """
	    # Local imports keep this block self-contained.
	    from fastapi import HTTPException
	    from fastapi.concurrency import run_in_threadpool

	    audio = await file.read()
	    if not audio:
	        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

	    # The Groq, embedding and SQLite calls are blocking. Inside an async
	    # handler they would stall the event loop, so run them in the threadpool.
	    result = await run_in_threadpool(
	        client.audio.transcriptions.create,
	        model="whisper-large-v3",
	        file=("audio.wav", audio),
	        response_format="text",
	    )

	    history = await run_in_threadpool(load)
	    response = await run_in_threadpool(run_chat, result, history)

	    return {
	        "transcript": result,
	        "response": response,
	    }

	import uvicorn

	if __name__ == "__main__":
	    # Direct execution: serve the app on all interfaces, port 7860.
	    uvicorn.run(
	        "app:app",
	        host="0.0.0.0",
	        port=7860,
	    )