Anshul Prasad committed
Commit 384c26a
1 Parent(s): 0489d07

feat: Switch from TinyLlama to Phi-3-mini for larger context window

- Replace TinyLlama 1.1B (2K context) with Phi-3-mini (4K context)
- Allows MAX_CONTEXT_TOKENS up to 10,000 (was limited to 800)
- Phi-3-mini: 2.4 GB, faster, better quality
- Use Phi-3 chat format with <|user|> and <|assistant|> tags (sketched below)
- Increase max_tokens from 1000 to 2000 for better answers
- All within HF Spaces constraints (16 GB RAM, 2 CPUs)

Files changed:
- api/generate_response.py +14 -7
- config.py +1 -1
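The Phi-3 chat format mentioned above is the template Microsoft documents for Phi-3-mini-4k-instruct: each turn is wrapped in a role tag and closed with <|end|>, which is why <|end|> and <|user|> appear in the new stop list. A minimal sketch of assembling such a prompt (build_phi3_prompt is a hypothetical helper, not code from this repo):

# Sketch of the Phi-3 chat template; build_phi3_prompt is illustrative only.
def build_phi3_prompt(system: str, user: str) -> str:
    # <|end|> closes each turn; generation stops when the model emits it.
    return (
        f"<|system|>\n{system}<|end|>\n"
        f"<|user|>\n{user}<|end|>\n"
        f"<|assistant|>\n"
    )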
api/generate_response.py CHANGED

@@ -12,20 +12,21 @@ llm = None
 def load_model_at_startup():
     global llm
     try:
-        logger.info("Loading model into RAM...")
+        logger.info("Loading Phi-3-mini model into RAM...")
 
         llm = Llama.from_pretrained(
-            repo_id="
-            filename="
+            repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
+            filename="Phi-3-mini-4k-instruct-Q4_K_M.gguf",
             verbose=False,
             n_gpu_layers=0,  # CPU only (safe for HF Spaces)
-            n_ctx=2048,
+            n_ctx=4096,
         )
-        logger.info("
+        logger.info("Phi-3-mini model loaded into RAM successfully.")
 
     except Exception as e:
         logger.error("Failed to load model: %s", e)
         llm = None
+
 def generate_response(query: str, context: str) -> str:
 
     if llm is None:

@@ -37,12 +38,18 @@ def generate_response(query: str, context: str) -> str:
     try:
         answer = llm(
             f"[SYSTEM]{SYSTEM_PROMPT}[/SYSTEM]\n{prompt}",
-            max_tokens=1000,
+            max_tokens=2000,
             temperature=1.0,
             top_p=1.0,
-            stop=["Question:", "
+            stop=["<|end|>", "Question:", "<|user|>"],
+            echo=False
         )
         answer = answer["choices"][0]["text"].strip()
+
+        if not answer:
+            logger.warning("Failed to generate response. Returning empty response.")
+            return "I couldn't generate response. Please try again."
+
         logging.info('Answer Generation Succeeded.')
         return answer
 
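For reference, the new loading path can be exercised outside the Space with a short smoke test, assuming llama-cpp-python (with huggingface_hub) is installed; from_pretrained downloads and caches the GGUF file on first use:

from llama_cpp import Llama

# Fetch the quantized GGUF from the HF Hub and load it on CPU, as in the diff above.
llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="Phi-3-mini-4k-instruct-Q4_K_M.gguf",
    n_gpu_layers=0,
    n_ctx=4096,
    verbose=False,
)

out = llm(
    "<|user|>\nWho are you?<|end|>\n<|assistant|>\n",
    max_tokens=64,
    stop=["<|end|>"],
)
print(out["choices"][0]["text"].strip())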
config.py CHANGED

@@ -15,7 +15,7 @@ RETRIEVED_TRANSCRIPTS_FILE = Path("outputs/retrieved_transcripts.txt")
 RESPONSE_FILE = Path("outputs/generated_response.txt")
 COOKIES_FILE = Path("utils/youtube_cookies.txt")
 
-MAX_CONTEXT_TOKENS = 800
+MAX_CONTEXT_TOKENS = 10000
 
 SYSTEM_PROMPT = """
 You are speaking as Acharya Prashant.