import os
import faiss
import pickle
import gradio as gr
import spaces
import uvicorn
import threading
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
from openai import OpenAI
# ===============================
# CONFIG
# ===============================
# Hugging Face access token for the OpenAI-compatible router API.
# NOTE(review): may be None if the secret is unset — the client will then
# fail at request time, not here.
HF_TOKEN = os.environ.get("HF_TOKEN")
# OpenAI-compatible Hugging Face client
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN
)
# Hosted chat model served through the HF router.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct:fastest"
# ===============================
# LOAD EMBEDDINGS
# ===============================
print("Loading embedding model...")
# Sentence embedder used for both indexing (offline) and query encoding.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# ===============================
# LOAD RAG DATA
# ===============================
print("Downloading FAISS index...")
index_path = hf_hub_download(
    repo_id="mahmodGendy/startup-llama-model",
    filename="faiss.index"
)
index = faiss.read_index(index_path)
print("Downloading documents...")
docs_path = hf_hub_download(
    repo_id="mahmodGendy/startup-llama-model",
    filename="docs.pkl"
)
# Fix: use a context manager so the file handle is closed deterministically
# (the original `pickle.load(open(...))` leaked the handle).
# SECURITY NOTE(review): pickle executes arbitrary code on load — acceptable
# only because the repo above is trusted; do not point this at user uploads.
with open(docs_path, "rb") as f:
    documents = pickle.load(f)
print("RAG system ready.")
# ===============================
# RAG RETRIEVAL
# ===============================
def retrieve_context(query, top_k=5):
    """Return the top_k stored documents most similar to *query*, newline-joined.

    Encodes the query with the sentence-transformer model, searches the
    FAISS index, and resolves the returned positions against `documents`.
    """
    query_vec = embedding_model.encode([query])
    _, neighbor_ids = index.search(query_vec, top_k)
    hits = []
    for doc_id in neighbor_ids[0]:
        hits.append(documents[doc_id])
    return "\n".join(hits)
# ===============================
# GPU / Hosted Inference
# ===============================
@spaces.GPU
def ask_llama(user_input):
    """Answer a startup question with RAG grounding via the hosted Llama model.

    Retrieves similar documents for context, chooses a structured or
    conversational response style depending on whether the question looks
    like a startup-idea evaluation, and calls the OpenAI-compatible HF
    router. Returns the model's reply as a string.
    """
    import re  # local import: only needed for keyword detection

    context = retrieve_context(user_input)
    evaluation_keywords = [
        "idea", "start", "business",
        "startup", "viable", "launch"
    ]
    # Fix: match whole words only. The original substring test fired on
    # unrelated words ("restart" contains "start", "enviable" contains
    # "viable"), forcing the structured format on ordinary questions.
    lowered = user_input.lower()
    is_eval = any(
        re.search(rf"\b{re.escape(word)}\b", lowered)
        for word in evaluation_keywords
    )
    if is_eval:
        response_style = """
        1. Problem Validation
        2. Market Evaluation
        3. Risks
        4. Improvement Suggestions
        """
    else:
        response_style = "Respond naturally and conversationally."
    system_prompt = f"""
You are a startup validation expert.
Language Rule:
- English → English
- MSA Arabic → MSA Arabic
- Egyptian dialect → Egyptian Arabic
Context:
{context}
{response_style}
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    # Use hosted API: Hugging Face OpenAI-compatible
    completion = client.chat.completions.create(
        model=MODEL_ID,
        messages=messages,
        max_tokens=400,
        temperature=0.7,
        top_p=0.9
    )
    return completion.choices[0].message.content
# ===============================
# FASTAPI
# ===============================
# REST entry point; served on port 8000 alongside the Gradio UI below.
app = FastAPI()
# Request body schema for POST /ask.
class Query(BaseModel):
    # The user's question, forwarded verbatim to the RAG pipeline.
    question: str
@app.post("/ask")
def ask(query: Query):
    """POST /ask — run the RAG pipeline on the question and wrap the reply."""
    return {"answer": ask_llama(query.question)}
# ===============================
# GRADIO (Required for ZeroGPU)
# ===============================
def gradio_wrapper(question):
    """Gradio callback: delegate the textbox question to the RAG pipeline."""
    answer = ask_llama(question)
    return answer
# Minimal Gradio UI. A running Gradio app is required on ZeroGPU Spaces so
# the @spaces.GPU decorator can schedule GPU time.
demo = gr.Interface(
    fn=gradio_wrapper,
    inputs=gr.Textbox(label="Ask your startup question"),
    outputs=gr.Textbox(label="Response")
)
# ===============================
# START SERVERS
# ===============================
def run_fastapi():
    """Serve the FastAPI app on all interfaces, port 8000 (blocking call)."""
    uvicorn.run(app, host="0.0.0.0", port=8000)
if __name__ == "__main__":
    # Start FastAPI in a background daemon thread. Fix: the original thread
    # was non-daemon, so the blocking uvicorn server kept the process alive
    # even after the Gradio server exited, preventing a clean shutdown.
    threading.Thread(target=run_fastapi, daemon=True).start()
    # Gradio runs in the main thread (required for ZeroGPU detection).
    demo.launch(server_name="0.0.0.0", server_port=7860)