Spaces:

prashantmatlani
/

coderg

Running

App Files Files Community

prashantmatlani committed 15 days ago

Commit

f730b8f

1 Parent(s): a5b3aff

hybrid llm

Browse files

Files changed (5) hide show

CoderG01.docx +0 -0
core_logic.py +52 -17
core_logic_earlier.py → core_logic_00.py +0 -0
core_logic_local.py +38 -29
core_logic_lw.py +65 -0

CoderG01.docx ADDED Viewed

Binary file (51.2 kB). View file

core_logic.py CHANGED Viewed

@@ -1,20 +1,53 @@
-# ./core_logic.py -> Token-safe
 import os
-from groq import Groq
 from tools import web_search, parse_file
-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
-model = "llama-3.1-8b-instant"
-# Compressed for token efficiency
-SYSTEM_PROMPT = (
-    "You're a Full-stack AI Engineering Genius. "
-    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
-    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
 )
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
@@ -28,7 +61,7 @@ def chat_function(message, history):
     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
@@ -40,7 +73,7 @@ def chat_function(message, history):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})
@@ -51,15 +84,17 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.0,
-            max_tokens=1024 # Limit response size to prevent mid-stream cuts
         )
         response_text = ""
         for chunk in completion:
-            if chunk.choices and chunk.choices[0].delta.content:
                 token = chunk.choices[0].delta.content
-                response_text += token
-                yield response_text
     except Exception as e:
-        yield f"Error: {str(e)}"

+# ./core_logic.py (hybrid variant) -> Token-safe
+"""
+Hybrid: Local LLM with HF UI
+"Master Stroke" for sharing the app while keeping compute costs at zero: the UI is hosted on Hugging Face, and the app "calls home" - the local PC - for answers.
+We expose the local Ollama instance through "The Tunnel", a secure bridge between the Hugging Face-hosted UI and the local LLM; its URL is supplied via the secret "LOCAL_LLM_URL". By default, Ollama only listens on localhost, so it must be told to accept external traffic from the tunnel. The request flow is:
+. The UI sends user messages to the Tunnel, which forwards them to the local Ollama instance.
+. Ollama processes the request and sends the response back through the Tunnel to the UI.
+"""
 import os
+from openai import OpenAI
 from tools import web_search, parse_file
+# Hybrid bridge - Sanitized URL to prevent double slashes
+tunnel_url = os.getenv("LOCAL_LLM_URL", "").rstrip("/")
+client = OpenAI(
+    base_url=f"{tunnel_url}/v1",
+    api_key="ollama"
 )
+model = "gemma4:latest"
+SYSTEM_PROMPT = """
+You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
+Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
+Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
+Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
+CORE DIRECTIVES:
+1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
+2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
+3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
+4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
+5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
+PERSONALITY:
+1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
+2. HUMBLE: Apologize when mistaken.
+3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
+When a user provides files, analyze the code structure and logic before proposing changes.
+"""
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
     # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
     # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
     # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # Keep last 3 turns for context stability
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})
             model=model,
             messages=messages,
             stream=True,
+            temperature=0.2, # Zero for architectural precision; incremented for creative architecture
+            max_tokens=1024
         )
         response_text = ""
         for chunk in completion:
+            # Check for valid delta content to avoid metadata crashes
+            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
                 token = chunk.choices[0].delta.content
+                if token:
+                    response_text += token
+                    yield response_text
     except Exception as e:
+        yield f"Silicon Error: {str(e)}"

core_logic_earlier.py → core_logic_00.py RENAMED Viewed

File without changes

core_logic_local.py CHANGED Viewed

@@ -1,7 +1,16 @@
 # ./core_logic_local.py
 from openai import OpenAI
 import os
 # Ollama serves an OpenAI-compatible API locally at port 11434
@@ -13,39 +22,49 @@ client = OpenAI(
 # Use local model served by Ollama. Make sure the server is running (ollama serve) and the model is available (ollama pull gemma4)
 model = "gemma4:latest"
-# Compressed for token efficiency
-SYSTEM_PROMPT = (
-    "You're a Full-stack AI Engineering Genius. "
-    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
-    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
-)
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
-    # 1. Process Files with character limits
     context_from_files = ""
     for f in files:
         path = f["path"] if isinstance(f, dict) else f
         file_content = parse_file(path)
         context_from_files += file_content
-    # TRUNCATE FILE CONTEXT: Max ~3000 tokens (approx 12,000 chars)
     if len(context_from_files) > 12000:
-        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
-    # 2. Research Trigger
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
         research_context = web_search(user_text)
         prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
     else:
         prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
-    # 3. Build Messages with History Slicing
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # ONLY KEEP LAST 3 TURNS: This is the 'Master Stroke' for staying under 6k TPM
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})
@@ -56,26 +75,16 @@ def chat_function(message, history):
             model=model,
             messages=messages,
             stream=True,
-            temperature=0.2,
-            max_tokens=1024 # Limit response size to prevent mid-stream cuts
         )
         response_text = ""
         for chunk in completion:
-            if chunk.choices and chunk.choices[0].delta.content:
                 token = chunk.choices[0].delta.content
-                response_text += token
-                yield response_text
     except Exception as e:
-        yield f"Error: {str(e)}"

 # ./core_logic_local.py
+"""
+Max Tokens: Increased for the local version. Since no token costs are incurred and there are no cloud timeouts, the Architect can:
+. handle longer file contexts,
+. perform thorough code reviews,
+. write deeper code analysis,
+. produce comprehensive solutions.
 from openai import OpenAI
+from tools import web_search, parse_file
 import os
 # Ollama serves an OpenAI-compatible API locally at port 11434
 # Use local model served by Ollama. Make sure the server is running (ollama serve) and the model is available (ollama pull gemma4)
 model = "gemma4:latest"
+SYSTEM_PROMPT = """
+You are the 'Silicon Architect' — a full-stack, master-stroke creative genius in AI Engineering and Technical Architecture.
+Your goal is to provide production-grade, highly optimized solutions for web and mobile AI applications.
+Expertise: Python (latest production version), Agentic Loops, FastAPI, and Scalable Architecture.
+Provide production-ready code and rigorous technical research with appropriate comments. Analyze files when provided. Be concise.
+CORE DIRECTIVES:
+1. ARCHITECTURAL RIGOR: Always consider scalability, async patterns, and state management.
+2. AGENTIC EXPERTISE: You understand recurrent-depth simulations, tool-calling, and autonomous loops.
+3. CODE QUALITY: Write clean, PEP 8 compliant, and secure Python/JS code.
+4. INNOVATION: Suggest the latest libraries and frameworks (FastAPI, LangGraph, Pydantic AI; but not limited to these).
+5. RESEARCH: If the user asks about new tech, use your Web Search capability to provide factual, up-to-date documentation.
+PERSONALITY:
+1. FRANK/POLITE: Disagree with the user, if needed; never resort to sycophancy, and suggest better alternatives.
+2. HUMBLE: Apologize when mistaken.
+3. FIRST PRINCIPLES: Base your responses and reasoning in Richard Feynman’s first principles thinking. Break down complex problems into fundamental truths and reason up from there.
+When a user provides files, analyze the code structure and logic before proposing changes.
+"""
 def chat_function(message, history):
     user_text = message.get("text", "")
     files = message.get("files", [])
     context_from_files = ""
     for f in files:
         path = f["path"] if isinstance(f, dict) else f
         file_content = parse_file(path)
         context_from_files += file_content
     if len(context_from_files) > 12000:
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated]..."
     if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
         research_context = web_search(user_text)
         prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
     else:
         prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     for turn in history[-3:]:
         messages.append({"role": turn["role"], "content": turn["content"]})
             model=model,
             messages=messages,
             stream=True,
+            temperature=0.2, # Slight temperature for creative architecture
+            max_tokens=2048 # Local power allows for longer responses
         )
         response_text = ""
         for chunk in completion:
+            if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
                 token = chunk.choices[0].delta.content
+                if token:
+                    response_text += token
+                    yield response_text
     except Exception as e:
+        yield f"Local Architect Error: {str(e)}"

core_logic_lw.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# ./core_logic_lw.py -> Token-safe
+import os
+from groq import Groq
+from tools import web_search, parse_file
+client = Groq(api_key=os.getenv("GROQ_API_KEY"))  # API key is read from the GROQ_API_KEY environment variable
+model = "llama-3.1-8b-instant"  # model name passed to every chat completion request below
+# System prompt sent as the first message of every request; kept short for token efficiency
+SYSTEM_PROMPT = (
+    "You're a Full-stack AI Engineering Genius. "
+    "Expert in Python (latest production version), Agentic Loops, and FastAPI, NodeJS, HTML, CSS. "
+    "Provide production-ready code with needed comments. Analyze files when provided. Be concise."
+)
+def chat_function(message, history):  # generator: yields the accumulated reply text after each streamed token
+    user_text = message.get("text", "")  # message is read as a dict with optional "text" and "files" keys
+    files = message.get("files", [])
+    # 1. Process files: parse every attachment and concatenate the results into one context string
+    context_from_files = ""
+    for f in files:
+        path = f["path"] if isinstance(f, dict) else f  # an entry is either a dict with a "path" key or the path itself
+        file_content = parse_file(path)  # presumably returns the file's text; confirm against tools.parse_file
+        context_from_files += file_content
+    # TRUNCATE FILE CONTEXT: cap at 12,000 characters (approx. 3000 tokens) and append a visible truncation marker
+    if len(context_from_files) > 12000:
+        context_from_files = context_from_files[:12000] + "\n...[File Content Truncated for TPM Limits]..."
+    # 2. Research trigger: run a web search when the lowercased user text contains any keyword as a substring
+    if any(keyword in user_text.lower() for keyword in ["search", "docs", "latest"]):
+        research_context = web_search(user_text)
+        prompt = f"RESEARCH:\n{research_context}\n\nFILES:\n{context_from_files}\n\nUSER: {user_text}"
+    else:
+        prompt = f"FILES:\n{context_from_files}\n\nUSER: {user_text}"
+    # 3. Build messages: system prompt first, then sliced history, then the new user prompt
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    # ONLY KEEP THE LAST 3 HISTORY ENTRIES (role/content dicts): the 'Master Stroke' for staying under 6k TPM
+    for turn in history[-3:]:
+        messages.append({"role": turn["role"], "content": turn["content"]})
+    messages.append({"role": "user", "content": prompt})
+    try:  # any exception from the request or the stream is reported to the caller as text (see except below)
+        completion = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            stream=True,
+            temperature=0.0,
+            max_tokens=1024  # Limit response size to prevent mid-stream cuts
+        )
+        response_text = ""
+        for chunk in completion:
+            if chunk.choices and chunk.choices[0].delta.content:  # skip chunks with no choices or empty content
+                token = chunk.choices[0].delta.content
+                response_text += token
+                yield response_text  # yield the full text so far, not just the new token
+    except Exception as e:
+        yield f"Error: {str(e)}"  # the error is yielded as a message instead of being raised