| |
| |
| |
|
|
import hashlib
import logging
import os
import sqlite3
import threading
import time
import zipfile
from pathlib import Path

import chromadb
import requests
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from groq import Groq
from sentence_transformers import SentenceTransformer
|
|
| |
| |
| |
|
|
# --- Dataset -----------------------------------------------------------
# Zip archive that is unpacked when the extracted directory is absent.
DATASET_ZIP = "maylbot_dataset.zip"
# Directory scanned recursively for *.txt files during ingestion.
DATASET_DIR = Path("maylbot_dataset")

# --- Persistence -------------------------------------------------------
CHROMA_DIR = "chroma"        # path handed to the persistent Chroma client
SQLITE_PATH = "memory.db"    # SQLite file holding the chat history table

# --- Models / collection -----------------------------------------------
COLLECTION_NAME = "maylbot"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GROQ_MODEL = "llama-3.3-70b-versatile"

# --- GitHub ------------------------------------------------------------
GITHUB_USER = "hamaylzahid"
GITHUB_API = "https://api.github.com/users/" + GITHUB_USER + "/repos"

# --- Retrieval / chunking ----------------------------------------------
TOP_K = 5              # number of results requested from the vector query
# Despite the name, this is compared against the *distance* returned by the
# vector query: results whose distance is below it are kept.
SIM_THRESHOLD = 0.85
CHUNK_SIZE = 120       # words per chunk
CHUNK_OVERLAP = 30     # words shared by consecutive chunks
|
|
| |
| |
| |
|
|
# ASGI application object; the @app.get / @app.post decorators later in this
# file register their handlers on it.
| app = FastAPI(title="MAYLBOT API") |
|
|
| |
| |
| |
|
|
def setup_dataset():
    """Ensure the dataset directory exists, unpacking the bundled archive if needed.

    Does nothing when ``DATASET_DIR`` is already present. Otherwise the
    ``DATASET_ZIP`` archive is extracted into the current directory; when the
    archive is absent too, a notice is printed and the function returns.
    """
    if DATASET_DIR.exists():
        return

    if not Path(DATASET_ZIP).exists():
        print("Dataset missing")
        return

    with zipfile.ZipFile(DATASET_ZIP, "r") as archive:
        archive.extractall(".")
|
|
| |
| |
| |
|
|
def chunk_text(text, size=None, overlap=None):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split.
        size: Maximum number of words per chunk. Defaults to ``CHUNK_SIZE``.
        overlap: Number of words shared by consecutive chunks. Defaults to
            ``CHUNK_OVERLAP``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        when *text* contains no words.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is not in
            ``[0, size)``; such values would make the window never advance.
    """
    size = CHUNK_SIZE if size is None else size
    overlap = CHUNK_OVERLAP if overlap is None else overlap
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError(
            f"need size > 0 and 0 <= overlap < size (got size={size}, overlap={overlap})"
        )

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat the overlap tail already contained in this chunk.
        if start + size >= len(words):
            break
    return chunks
|
|
| |
| |
| |
|
|
def load_embed():
    """Instantiate the sentence-embedding model named by ``EMBED_MODEL``."""
    encoder = SentenceTransformer(EMBED_MODEL)
    return encoder
|
|
def get_collection():
    """Open the persistent Chroma store at ``CHROMA_DIR`` and return the app's collection.

    The collection named ``COLLECTION_NAME`` is created when it does not exist yet.
    """
    store = chromadb.PersistentClient(path=CHROMA_DIR)
    collection = store.get_or_create_collection(name=COLLECTION_NAME)
    return collection
|
|
def ingest(col, model):
    """Embed every ``*.txt`` file under ``DATASET_DIR`` and add new chunks to *col*.

    Each chunk id is the MD5 hex digest of ``"<file path>_<chunk index>"``, so
    chunks whose id is already stored are skipped and the function can be
    re-run safely.

    Args:
        col: Chroma collection to populate.
        model: Embedding model exposing ``encode``.
    """
    # Only the ids are needed here; an empty ``include`` avoids pulling every
    # stored document and metadata record into memory.
    existing = set(col.get(include=[])["ids"])

    for path in sorted(DATASET_DIR.rglob("*.txt")):
        # Explicit encoding so results do not depend on the platform default.
        text = path.read_text(encoding="utf-8", errors="ignore")

        new_ids = []
        new_docs = []
        for index, chunk in enumerate(chunk_text(text)):
            chunk_id = hashlib.md5(f"{path}_{index}".encode()).hexdigest()
            if chunk_id in existing:
                continue
            existing.add(chunk_id)
            new_ids.append(chunk_id)
            new_docs.append(chunk)

        if not new_docs:
            continue

        # One encode call and one insert per file instead of one per chunk.
        embeddings = model.encode(new_docs).tolist()
        col.add(documents=new_docs, embeddings=embeddings, ids=new_ids)
|
|
| |
| |
| |
|
|
def retrieve(q, col, model):
    """Return the stored documents closest to the query *q*.

    Args:
        q: Query text.
        col: Chroma collection to search.
        model: Embedding model exposing ``encode``.

    Returns:
        The documents whose query distance is below ``SIM_THRESHOLD``. When
        none qualify, all retrieved documents are returned instead. An empty
        list is returned when the collection holds nothing.
    """
    # The collection stays empty when the dataset was missing at startup;
    # skip the query in that case and never ask for more results than exist.
    available = col.count()
    if available == 0:
        return []

    query_vector = model.encode(q).tolist()
    result = col.query(
        query_embeddings=[query_vector],
        n_results=min(TOP_K, available),
        include=["documents", "distances"],
    )

    # A single query was sent, so take the first (only) result row.
    docs = result["documents"][0]
    dists = result["distances"][0]

    close_enough = [doc for doc, dist in zip(docs, dists) if dist < SIM_THRESHOLD]
    return close_enough or docs
|
|
| |
| |
| |
|
|
def init_db():
    """Create the ``chat(role, content)`` table in ``SQLITE_PATH`` if it is missing."""
    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS chat(role TEXT, content TEXT)")
        conn.commit()
    finally:
        # Close even when the statement fails so the handle is never leaked.
        conn.close()
|
|
def save(role, msg):
    """Append one chat message to the history table.

    Args:
        role: Message role stored in the ``role`` column.
        msg: Message text stored in the ``content`` column.
    """
    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
    try:
        conn.execute("INSERT INTO chat VALUES (?,?)", (role, msg))
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is never leaked.
        conn.close()
|
|
def load():
    """Return the whole chat history, oldest message first.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in insertion order.
    """
    conn = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
    try:
        # Callers slice the tail to get the most recent turns, so make the
        # insertion order explicit rather than relying on the default scan.
        rows = conn.execute(
            "SELECT role, content FROM chat ORDER BY rowid"
        ).fetchall()
    finally:
        conn.close()
    return [{"role": role, "content": content} for role, content in rows]
|
|
| |
| |
| |
|
|
def get_groq():
    """Build a Groq client from the ``GROQ_API_KEY`` environment variable.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        return Groq(api_key=api_key)
    raise ValueError("Missing GROQ_API_KEY")
|
|
| |
| |
| |
|
|
# Last successful GitHub listing and the time.time() value when it was fetched.
cache = {"data": None, "time": 0}
# Serializes access to ``cache`` (and the refresh request) across threads.
lock = threading.Lock()


def github():
    """Return a short text listing of the user's GitHub repositories.

    The listing holds one ``"<name> - <language>"`` line for each of the first
    ten repositories returned by ``GITHUB_API``. A successful result is reused
    for 300 seconds.

    Returns:
        The listing text. If the lookup fails, the previously cached listing
        is returned when there is one, otherwise an empty string.
    """
    with lock:
        if time.time() - cache["time"] < 300:
            return cache["data"]

        try:
            # The lock is held during the request, so bound how long it can take.
            resp = requests.get(GITHUB_API, timeout=10)
            # Error responses (e.g. rate limiting) carry a JSON object, not a repo list.
            resp.raise_for_status()
            repos = resp.json()
            txt = "\n".join(
                f"{repo['name']} - {repo['language']}" for repo in repos[:10]
            )
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            # Best effort: the prompt still works without live GitHub data.
            logging.getLogger(__name__).warning("GitHub repo lookup failed: %s", exc)
            return cache["data"] or ""

        cache["data"] = txt
        cache["time"] = time.time()
        return txt
|
|
| |
| |
| |
|
|
def build_prompt(context, history, gh):
    """Assemble the system prompt for the chat model.

    Args:
        context: Retrieved document strings; joined one per line.
        history: Chat history list; only its last six entries are embedded,
            rendered with their default string form.
        gh: Text placed under the ``LIVE GITHUB`` heading.

    Returns:
        The complete prompt string.
    """
    context_block = "\n".join(context)
    recent_memory = history[-6:]
    return f"""
You are MAYLBOT — a high-end AI assistant built by an advanced AI engineer.

PERSONALITY:
- Confident, sharp, slightly witty
- Speaks like a real engineer, not a chatbot
- No robotic phrasing

IDENTITY:
- Hamayl Zahid is a female AI engineer
- ALWAYS refer to her as she/her

INTELLIGENCE:
- Combine reasoning + memory + context
- Fill small gaps logically
- Never sound clueless

When answering analytical questions (like hiring, rating, comparison):

Return structure:

1. Evidence Found:
2. Missing Evidence:
3. Reasoning:
4. Final Verdict:
5. Confidence Level (Low / Medium / High)

ANALYSIS:
When evaluating projects:
- technical depth
- real-world value
- innovation

LIVE GITHUB:
{gh}

CONTEXT:
{context_block}

MEMORY:
{recent_memory}

RULES:
- No "I don't know" unless zero signal
- Be natural, not formal AI tone
- Keep answers smart and clean
"""
|
|
|
|
| |
| |
| |
|
|
# Import-time startup: prepare the dataset, then build the shared singletons
# used by the request handlers (embedding model, vector collection, Groq
# client) and make sure the chat-history table exists.
setup_dataset()
model = load_embed()
col = get_collection()
client = get_groq()
init_db()

# Ingestion runs only while the collection holds no entries at all.
if not col.count():
    ingest(col, model)
|
|
| |
| |
| |
|
|
def run_chat(q, history):
    """Answer the user message *q* and record the exchange.

    Retrieves context for *q*, builds the system prompt, sends it together
    with the last six history entries and the new user message to the Groq
    chat model, then saves both the user message and the reply.

    Args:
        q: The user's message.
        history: Prior chat messages as ``{"role", "content"}`` dicts.

    Returns:
        The content of the model's first completion choice.
    """
    system_prompt = build_prompt(retrieve(q, col, model), history, github())

    conversation = [
        {"role": "system", "content": system_prompt},
        *history[-6:],
        {"role": "user", "content": q},
    ]

    completion = client.chat.completions.create(
        model=GROQ_MODEL,
        messages=conversation,
        temperature=0.5,
    )
    reply = completion.choices[0].message.content

    # Persist the turn only after the model call has succeeded.
    for role, text in (("user", q), ("assistant", reply)):
        save(role, text)

    return reply
|
|
| |
| |
| |
|
|
| from fastapi.responses import FileResponse |
|
|
@app.get("/")
def ui():
    """Serve the ``index.html`` page (given as a relative path) at the site root."""
    page = FileResponse("index.html")
    return page
|
|
@app.post("/chat")
def chat_api(payload: dict):
    """Handle a text chat request.

    Args:
        payload: JSON body; its ``"message"`` entry is the user's text.

    Returns:
        ``{"response": <model reply>}``.

    Raises:
        HTTPException: 400 when ``"message"`` is missing, not a string, or blank.
    """
    q = payload.get("message")
    # Reject bad input here; otherwise it reaches the embedding model and
    # surfaces as an opaque server error.
    if not isinstance(q, str) or not q.strip():
        raise HTTPException(
            status_code=400,
            detail="'message' must be a non-empty string",
        )

    history = load()
    response = run_chat(q, history)
    return {"response": response}
|
|
@app.post("/voice")
async def voice_api(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and answer the transcript.

    Args:
        file: The uploaded audio.

    Returns:
        ``{"transcript": <transcription result>, "response": <model reply>}``.

    Raises:
        HTTPException: 400 when the upload contains no data.
    """
    audio = await file.read()
    if not audio:
        raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

    # The Groq client, SQLite access and the chat pipeline are all blocking;
    # run them in the thread pool so this coroutine does not stall the event
    # loop for every other request.
    result = await run_in_threadpool(
        client.audio.transcriptions.create,
        model="whisper-large-v3",
        file=("audio.wav", audio),
        response_format="text",
    )

    history = await run_in_threadpool(load)
    response = await run_in_threadpool(run_chat, result, history)

    return {
        "transcript": result,
        "response": response,
    }
|
|
| import uvicorn |
|
|
if __name__ == "__main__":
    # Pass the app object instead of the "app:app" import string. The string
    # form makes uvicorn import this file a second time (re-running all of
    # the import-time startup work) and only resolves when the file is named
    # app.py.
    uvicorn.run(app, host="0.0.0.0", port=7860)