alaselababatunde committed
Commit 5e51aba · 1 Parent(s): ee2e099
.~lock.SME_Builder_Dataset.csv# ADDED
@@ -0,0 +1 @@
+,alash-studios,alash-studios-HP-EliteBook-840-G3,19.09.2025 18:30,file:///home/alash-studios/.config/libreoffice/4;
main.py CHANGED
@@ -8,6 +8,7 @@ from langchain.prompts import PromptTemplate
 from langchain_huggingface import HuggingFaceEndpoint
 from langdetect import detect, DetectorFactory
 from huggingface_hub.utils import HfHubHTTPError  # for quota error handling
+from smebuilder_vector import retriever  # <-- your retriever
 
 # ----------------- CONFIG -----------------
 DetectorFactory.seed = 0
@@ -26,15 +27,14 @@ spitch_client = Spitch()
 
 # HuggingFace LLM (better tuned for code generation)
 llm = HuggingFaceEndpoint(
-    repo_id="deepseek-ai/deepseek-coder-1.3b-instruct",
+    repo_id=HF_MODEL,
     temperature=0.7,
     top_p=0.9,
     do_sample=True,
     repetition_penalty=1.1,
-    max_new_tokens=1024
+    max_new_tokens=2048  # bumped tokens
 )
 
-
 # FastAPI app
 app = FastAPI(title="DevAssist AI Backend (FastAPI + LangChain)")
 
@@ -93,6 +93,7 @@ Guidelines:
 - Return **only valid JSON** with keys: "files" → { "index.html": "...", "styles.css": "...", "script.js": "..." }
 
 Prompt: {user_prompt}
+Context: {context}
 
 Output:
 """
@@ -101,7 +102,7 @@ Output:
 chat_chain = PromptTemplate(input_variables=["question"], template=chat_template) | llm
 stt_chain = PromptTemplate(input_variables=["speech"], template=stt_chat_template) | llm
 autodoc_chain = PromptTemplate(input_variables=["code"], template=autodoc_template) | llm
-sme_chain = PromptTemplate(input_variables=["user_prompt"], template=sme_template) | llm
+sme_chain = PromptTemplate(input_variables=["user_prompt", "context"], template=sme_template) | llm
 
 # ----------------- REQUEST MODELS -----------------
 class ChatRequest(BaseModel):
@@ -187,7 +188,11 @@ def autodoc(req: AutoDocRequest, authorization: str | None = Header(None)):
 @app.post("/sme/generate")
 async def sme_generate(payload: dict = Body(...)):
     try:
-        response = sme_chain.invoke({"user_prompt": payload.get("user_prompt", "")})
+        user_prompt = payload.get("user_prompt", "")
+        # retrieve context
+        context_docs = retriever.get_relevant_documents(user_prompt)
+        context = "\n".join([doc.page_content for doc in context_docs]) if context_docs else "No extra context"
+        response = sme_chain.invoke({"user_prompt": user_prompt, "context": context})
         return {"success": True, "data": response}
     except HfHubHTTPError as e:
         if "exceeded" in str(e).lower() or "quota" in str(e).lower():
@@ -228,7 +233,11 @@ async def sme_speech_generate(file: UploadFile = File(...), lang_hint: str | Non
         translation = transcription
 
     try:
-        sme_response = sme_chain.invoke({"user_prompt": translation})
+        # vector retrieval here too
+        context_docs = retriever.get_relevant_documents(translation)
+        context = "\n".join([doc.page_content for doc in context_docs]) if context_docs else "No extra context"
+
+        sme_response = sme_chain.invoke({"user_prompt": translation, "context": context})
         return {
             "success": True,
             "transcription": transcription,
sme_builder_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
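
The dataset is too large to render inline, but judging from the column lookups in smebuilder_vector.py below, each row is expected to provide at least prompt, html_code, css_code, js_code, and sector. A quick local schema check, assuming those inferred column names (they are not confirmed against the raw file):

```python
# Hypothetical schema check for sme_builder_dataset.csv; expected column names are
# inferred from the row.get(...) calls in smebuilder_vector.py.
import pandas as pd

df = pd.read_csv("sme_builder_dataset.csv")
expected = {"prompt", "html_code", "css_code", "js_code", "sector"}
missing = expected - set(df.columns)
print(f"{len(df)} rows; missing columns: {missing or 'none'}")
```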
 
smebuilder_vector.py ADDED
@@ -0,0 +1,55 @@
+# smebuilder_vector.py
+
+import os
+import pandas as pd
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
+
+# ----------------- CONFIG -----------------
+DATASET_PATH = "sme_builder_dataset.csv"
+DB_LOCATION = "./Dev_Assist_SME_Builder_DB"
+COLLECTION_NAME = "landing_page_generation_examples"
+EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+# ----------------- LOAD DATASET -----------------
+if not os.path.exists(DATASET_PATH):
+    raise FileNotFoundError(f"Dataset file not found: {DATASET_PATH}")
+
+df = pd.read_csv(DATASET_PATH)
+
+# ----------------- EMBEDDINGS -----------------
+embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+
+# Check if vector store exists
+add_documents = not os.path.exists(DB_LOCATION)
+
+# ----------------- CREATE DOCUMENTS -----------------
+documents, ids = [], []
+if add_documents:
+    for i, row in df.iterrows():
+        prompt = row.get("prompt", "")
+        html_code = row.get("html_code", "")
+        css_code = row.get("css_code", "")
+        js_code = row.get("js_code", "")
+        sector = row.get("sector", "")
+
+        page_content = " ".join(
+            [str(prompt), str(html_code), str(css_code), str(js_code), str(sector)]
+        ).strip()
+
+        documents.append(Document(page_content=page_content, id=str(i)))
+        ids.append(str(i))
+
+# ----------------- VECTOR STORE -----------------
+vector_store = Chroma(
+    collection_name=COLLECTION_NAME,
+    persist_directory=DB_LOCATION,
+    embedding_function=embeddings,
+)
+
+if add_documents and documents:
+    vector_store.add_documents(documents=documents, ids=ids)
+
+# ----------------- RETRIEVER -----------------
+retriever = vector_store.as_retriever(search_kwargs={"k": 20})
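
A small sketch for sanity-checking the new retriever module on its own; the query string is made up, and on the first run building the store embeds the whole CSV, which can take a while. In recent LangChain releases retriever.invoke(query) is the preferred call, while get_relevant_documents still works as used in main.py:

```python
# Hypothetical standalone check of the Chroma-backed retriever.
from smebuilder_vector import retriever

docs = retriever.get_relevant_documents("landing page for a fashion retail SME")
print(f"retrieved {len(docs)} example(s)")  # k=20, capped by collection size
for doc in docs[:3]:
    print(doc.page_content[:120], "...")
```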