prashantmatlani committed on
Commit
f730b8f
·
1 Parent(s): a5b3aff

hybrid llm

Browse files
CoderG01.docx ADDED
Binary file (51.2 kB). View file
 
core_logic.py CHANGED
@@ -1,20 +1,53 @@
1
 
2
- # ./core_logic.py -> Token-safe
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  import os
5
- from groq import Groq
6
  from tools import web_search, parse_file
7
 
8
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
9
- model = "llama-3.1-8b-instant"
10
 
11
- # Compressed for token efficiency
12
- SYSTEM_PROMPT = (
13
- "You're a Full-stack AI Engineering Genius. "
14
- "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
15
- "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
16
  )
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def chat_function(message, history):
19
  user_text = message.get("text", "")
20
  files = message.get("files", [])
@@ -28,7 +61,7 @@ def chat_function(message, history):
28
 
29
  # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
30
  if len(context_from_files) > 12000:
31
- context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
32
 
33
  # 2. Research Trigger
34
  if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -40,7 +73,7 @@ def chat_function(message, history):
40
  # 3. Build Messages with History Slicing
41
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
42
 
43
- # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
44
  for turn in history[-3:]:
45
  messages.append({"role": turn["role"], "content": turn["content"]})
46
 
@@ -51,15 +84,17 @@ def chat_function(message, history):
51
  model=model,
52
  messages=messages,
53
  stream=True,
54
- temperature=0.0,
55
- max_tokens=1024 # Limit response size to prevent mid-stream cuts
56
  )
57
 
58
  response_text = ""
59
  for chunk in completion:
60
- if chunk.choices and chunk.choices[0].delta.content:
 
61
  token = chunk.choices[0].delta.content
62
- response_text += token
63
- yield response_text
 
64
  except Exception as e:
65
- yield f"Error: {str(e)}"
 
1
 
2
+ # ./core_logic_hybrid.py -> Token-safe
3
+
4
+ """
5
+
6
+ Hybrid: Local LLM with HF UI
7
+
8
+ "Master Stroke" for sharing the app while keeping compute costs at zero: with the UI hosted on Hugging Face, the app "calls home" (to the local PC) for answers.
9
+
10
+ We expose the local Ollama server through "The Tunnel" (its URL is stored in the secret "LOCAL_LLM_URL"), a secure bridge between the Hugging Face-hosted UI and the local LLM. By default, Ollama only listens on localhost, so it must be told to accept external traffic from the tunnel. The request flow is:
11
+ . The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance
12
+ . Ollama processes the request and sends the response back through the Tunnel to the UI.
13
+ """
14
 
15
  import os
16
+ from openai import OpenAI
17
  from tools import web_search, parse_file
18
 
19
+ # Hybrid bridge - Sanitized URL to prevent double slashes
20
+ tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")
21
 
22
+ client = OpenAI(
23
+ base_url=f"{tunnel_url}/v1",
24
+ api_key="ollama"
 
 
25
  )
26
 
27
+ model = "gemma4:latest"
28
+
29
+ SYSTEM_PROMPT = """
30
+ You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
31
+ Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
32
+
33
+ Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
34
+ Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
35
+
36
+ CORE DIRECTIVES:
37
+ 1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
38
+ 2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
39
+ 3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
40
+ 4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
41
+ 5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
42
+
43
+ PERSONALITY:
44
+ 1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
45
+ 2. HUMBLE: Apologize when mistaken.
46
+ 3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
47
+
48
+ When a user provides files, analyze the code structure and logic before proposing changes.
49
+ """
50
+
51
  def chat_function(message, history):
52
  user_text = message.get("text", "")
53
  files = message.get("files", [])
 
61
 
62
  # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
63
  if len(context_from_files) > 12000:
64
+ context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
65
 
66
  # 2. Research Trigger
67
  if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
 
73
  # 3. Build Messages with History Slicing
74
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
75
 
76
+ # Keep last 3 turns for context stability
77
  for turn in history[-3:]:
78
  messages.append({"role": turn["role"], "content": turn["content"]})
79
 
 
84
  model=model,
85
  messages=messages,
86
  stream=True,
87
+ temperature=0.2, # Zero for architectural precision; incremented for creative architecture
88
+ max_tokens=1024
89
  )
90
 
91
  response_text = ""
92
  for chunk in completion:
93
+ # Check for valid delta content to avoid metadata crashes
94
+ if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
95
  token = chunk.choices[0].delta.content
96
+ if token:
97
+ response_text += token
98
+ yield response_text
99
  except Exception as e:
100
+ yield f"Silicon Error: {str(e)}"
core_logic_earlier.py → core_logic_00.py RENAMED
File without changes
core_logic_local.py CHANGED
@@ -1,7 +1,16 @@
1
 
2
  # ./core_logic_local.py
3
 
 
 
 
 
 
 
 
 
4
  from openai import OpenAI
 
5
  import os
6
 
7
  # Ollama serves an OpenAI-compatible API locally at port 11434
@@ -13,39 +22,49 @@ client = OpenAI(
13
  # Use local model served by Ollama. Make sure to run: ollama serve gemma4
14
  model = "gemma4:latest"
15
 
16
- # Compressed for token efficiency
17
- SYSTEM_PROMPT = (
18
- "You're a Full-stack AI Engineering Genius. "
19
- "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
20
- "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
21
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def chat_function(message, history):
24
  user_text = message.get("text", "")
25
  files = message.get("files", [])
26
 
27
- # 1. Process Files with character limits
28
  context_from_files = ""
29
  for f in files:
30
  path = f["path"] if isinstance(f, dict) else f
31
  file_content = parse_file(path)
32
  context_from_files += file_content
33
 
34
- # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
35
  if len(context_from_files) > 12000:
36
- context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
37
 
38
- # 2. Research Trigger
39
  if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
40
  research_context = web_search(user_text)
41
  prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
42
  else:
43
  prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
44
 
45
- # 3. Build Messages with History Slicing
46
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
47
 
48
- # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
49
  for turn in history[-3:]:
50
  messages.append({"role": turn["role"], "content": turn["content"]})
51
 
@@ -56,26 +75,16 @@ def chat_function(message, history):
56
  model=model,
57
  messages=messages,
58
  stream=True,
59
- temperature=0.2,
60
- max_tokens=1024 # Limit response size to prevent mid-stream cuts
61
  )
62
 
63
  response_text = ""
64
  for chunk in completion:
65
- if chunk.choices and chunk.choices[0].delta.content:
66
  token = chunk.choices[0].delta.content
67
- response_text += token
68
- yield response_text
 
69
  except Exception as e:
70
- yield f"Error: {str(e)}"
71
-
72
-
73
-
74
-
75
-
76
-
77
-
78
-
79
-
80
-
81
-
 
1
 
2
  # ./core_logic_local.py
3
 
4
+ """
5
+ Max Tokens: Increased for the local version. Since no token cost is incurred and there are no cloud timeouts, the Architect can:
6
+ . handle longer file contexts,
7
+ . perform thorough code review,
8
+ . write deeper code analysis,
9
+ . produce comprehensive solutions
10
+ """
11
+
12
  from openai import OpenAI
13
+ from tools import web_search, parse_file
14
  import os
15
 
16
  # Ollama serves an OpenAI-compatible API locally at port 11434
 
22
  # Use local model served by Ollama. Make sure to run: ollama serve gemma4
23
  model = "gemma4:latest"
24
 
25
+ SYSTEM_PROMPT = """
26
+ You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
27
+ Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
28
+
29
+ Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
30
+ Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
31
+
32
+ CORE DIRECTIVES:
33
+ 1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
34
+ 2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
35
+ 3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
36
+ 4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
37
+ 5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
38
+
39
+ PERSONALITY:
40
+ 1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
41
+ 2. HUMBLE: Apologize when mistaken.
42
+ 3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
43
+
44
+ When a user provides files, analyze the code structure and logic before proposing changes.
45
+ """
46
 
47
  def chat_function(message, history):
48
  user_text = message.get("text", "")
49
  files = message.get("files", [])
50
 
 
51
  context_from_files = ""
52
  for f in files:
53
  path = f["path"] if isinstance(f, dict) else f
54
  file_content = parse_file(path)
55
  context_from_files += file_content
56
 
 
57
  if len(context_from_files) > 12000:
58
+ context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
59
 
 
60
  if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
61
  research_context = web_search(user_text)
62
  prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
63
  else:
64
  prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
65
 
 
66
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
67
 
 
68
  for turn in history[-3:]:
69
  messages.append({"role": turn["role"], "content": turn["content"]})
70
 
 
75
  model=model,
76
  messages=messages,
77
  stream=True,
78
+ temperature=0.2, # Slight temperature for creative architecture
79
+ max_tokens=2048 # Local power allows for longer responses
80
  )
81
 
82
  response_text = ""
83
  for chunk in completion:
84
+ if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
85
  token = chunk.choices[0].delta.content
86
+ if token:
87
+ response_text += token
88
+ yield response_text
89
  except Exception as e:
90
+ yield f"Local Architect Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
core_logic_lw.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # ./core_logic_lw.py -> Token-safe
3
+
4
+ import os
5
+ from groq import Groq
6
+ from tools import web_search, parse_file
7
+
8
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
9
+ model = "llama-3.1-8b-instant"
10
+
11
+ # Compressed for token efficiency
12
+ SYSTEM_PROMPT = (
13
+ "You're a Full-stack AI Engineering Genius. "
14
+ "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
15
+ "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
16
+ )
17
+
18
+ def chat_function(message, history):
19
+ user_text = message.get("text", "")
20
+ files = message.get("files", [])
21
+
22
+ # 1. Process Files with character limits
23
+ context_from_files = ""
24
+ for f in files:
25
+ path = f["path"] if isinstance(f, dict) else f
26
+ file_content = parse_file(path)
27
+ context_from_files += file_content
28
+
29
+ # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
30
+ if len(context_from_files) > 12000:
31
+ context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
32
+
33
+ # 2. Research Trigger
34
+ if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
35
+ research_context = web_search(user_text)
36
+ prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
37
+ else:
38
+ prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
39
+
40
+ # 3. Build Messages with History Slicing
41
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
42
+
43
+ # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
44
+ for turn in history[-3:]:
45
+ messages.append({"role": turn["role"], "content": turn["content"]})
46
+
47
+ messages.append({"role": "user", "content": prompt})
48
+
49
+ try:
50
+ completion = client.chat.completions.create(
51
+ model=model,
52
+ messages=messages,
53
+ stream=True,
54
+ temperature=0.0,
55
+ max_tokens=1024 # Limit response size to prevent mid-stream cuts
56
+ )
57
+
58
+ response_text = ""
59
+ for chunk in completion:
60
+ if chunk.choices and chunk.choices[0].delta.content:
61
+ token = chunk.choices[0].delta.content
62
+ response_text += token
63
+ yield response_text
64
+ except Exception as e:
65
+ yield f"Error: {str(e)}"