# biological_context_language.py
# BCL component: plans biological workflows and extracts manufacturability
# constraints via the Groq LLM.
import json
import os
import time

from groq import Groq
# NOTE(review): validate/ValidationError are not used in this file as seen;
# kept in case another part of the project relies on this module's namespace.
from jsonschema import validate, ValidationError

from databaseengine import DatabaseEngine

# Shared database handle for conversation and executed-operation persistence.
de = DatabaseEngine()

# SECURITY: a live Groq API key was previously hard-coded here. That key must
# be rotated; the client now reads the key from the GROQ_API_KEY environment
# variable instead of embedding a secret in source control.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Few-shot format example for the retrieve_uniprot_sequence operation.
# Fix: removed the trailing comma after the biological_inputs object, which
# made the example invalid JSON (and invited the model to copy the mistake).
# NOTE(review): this is a plain (non-f) string, so the doubled braces are
# emitted verbatim wherever it is used — confirm downstream consumers expect
# that, since the planner rules demand single curly braces.
uniprot_sequence='''
FORMAT FOR retrieve_uniprot_sequence:
{{
"operation": "retrieve_uniprot_sequence",
"biological_inputs": {{
"gene_symbol": "HER2"
}}
}}
'''
# Few-shot format examples handed to the planner LLM (V2, with concrete
# dependency names). Fixes relative to the previous revision:
#   - added missing commas after "biological_inputs" objects and after the
#     analyze_epitopes "operation" key (the examples were invalid JSON),
#   - removed the trailing comma inside domain_determination's inputs,
#   - closed the "<...>" placeholder in raw_prompt,
#   - corrected depends "fetch_template_nanobody" to the actual operation
#     name "fetch_nanobody_template".
BCL_TASK_FORMAT_FOR_EXP_V2="""
FORMAT FOR introduce_point_mutation:
{{
"operation": "introduce_point_mutation",
"biological_inputs": {{
"wildtype_sequence": "",
"mutation": "S310F"
}},
"depends": "retrieve_uniprot_sequence"
}}
FORMAT FOR predict_structure:
{{
"operation":"predict_structure",
"biological_inputs":{{
"sequence_for_structure":""
}},
"depends": "domain_determination"
}}
FORMAT FOR analyze_epitopes:
{{
"operation":"analyze_epitopes",
"biological_inputs":{{
"structure":""
}},
"depends": "predict_structure"
}}
FORMAT FOR domain_determination:
{{
"operation":"domain_determination",
"biological_inputs": {{
"sequence":""
}},
"depends":"introduce_point_mutation"
}}
FORMAT FOR fetch_nanobody_template:
{{
"operation":"fetch_nanobody_template",
"biological_inputs":{{
"nanobody":""
}},
"depends":"None"
}}
FORMAT FOR observe_orient_decide_act_loop:
{{
"operation": "observe_orient_decide_act_loop",
"biological_inputs": {{
"sequence": "",
"raw_prompt": "<fill this with the actual high level bio query received from the user>"
}},
"depends": "fetch_nanobody_template"
}}
FORMAT FOR nanobody_template_mutator:
{{
"operation":"nanobody_template_mutator",
"biological_inputs":{{
"sequence":""
}},
"depends":"observe_orient_decide_act_loop"
}}
FORMAT FOR engineer_nanobody:
{{
"operation":"engineer_nanobody",
"biological_inputs":{{
"template_sequence":""
}},
"depends":"nanobody_template_mutator"
}}
"""
# Legacy few-shot format examples (generic "depends" descriptions).
# Fixes relative to the previous revision: added the commas that were missing
# after the "biological_inputs" objects and after analyze_epitopes'
# "operation" key, and removed the trailing comma inside
# domain_determination's inputs — all of which made the examples invalid JSON.
BCL_TASK_FORMAT_FOR_EXP="""
FORMAT FOR introduce_point_mutation:
{{
"operation": "introduce_point_mutation",
"biological_inputs": {{
"wildtype_sequence": "",
"mutation": "S310F"
}},
"depends": "name of the operation (operation key) it depends on"
}}
FORMAT FOR predict_structure:
{{
"operation":"predict_structure",
"biological_inputs":{{
"sequence":""
}},
"depends": "name of the operation (operation key) it depends on"
}}
FORMAT FOR analyze_epitopes:
{{
"operation":"analyze_epitopes",
"biological_inputs":{{
"structure":""
}},
"depends": "name of the operation (operation key) it depends on"
}}
FORMAT FOR domain_determination:
{{
"operation":"domain_determination",
"biological_inputs": {{
"sequence":""
}},
"depends":"name of the (operation key) it depends on"
}}
"""
# Operation names the planner prompt advertises as allowed output.
# NOTE(review): the V2 format examples also document retrieve_uniprot_sequence,
# fetch_nanobody_template, observe_orient_decide_act_loop,
# nanobody_template_mutator and engineer_nanobody, which are absent here —
# confirm whether they should be added to this whitelist.
supported_experiments=[
"introduce_point_mutation",
"predict_structure",
"analyze_epitopes",
"cdr_identification",
"cdr_docking_with_epitopes",
"domain_determination"
]
# JSON shape the constraint extractor must emit. Plain (non-f) string:
# f-string interpolation inserts this text verbatim, so the doubled braces
# survive unchanged in the final PROMPT_FOR_CONSTRAINTS_V2 prompt.
CONSTRAINT_FORMAT="""
{{
"expression_system": string | null,
"avoid_aggregation": true | false | null,
"solubility_score_min": float (0.0–1.0) | null,
"yield_level": "low" | "medium" | "high" | null,
"codon_optimization": string | null,
"expression_temperature": string | null
}}
"""
# Constraint keys the extractor may populate; mirrors CONSTRAINT_FORMAT above.
supported_constraints=[
"expression_system",
"avoid_aggregation",
"solubility_score_min",
"yield_level",
"codon_optimization",
"expression_temperature"
]
# Module-level cache of previously executed workflow steps; reassigned inside
# BCL_PLANNER from DB memory. NOTE(review): PROMPT_FOR_PLANNER_V2 interpolates
# this value once at import time (i.e. None), so later reassignments are never
# reflected in that prompt constant — confirm whether that is intended.
EXECUTED_WORKFLOW=None
# System prompt for the manufacturability-constraint extractor.
# Fix: the yield inference rule said to map descriptors to
# "high"/"moderate"/"low", contradicting CONSTRAINT_FORMAT (and the examples),
# which only allow "low" | "medium" | "high" — corrected to "medium".
PROMPT_FOR_CONSTRAINTS_V2=f"""
ROLE:
You are a manufacturability constraint extractor for biological AI systems.
TASK:
Extract technical constraints from casual biological descriptions. Parse ANY phrasing - formal requests, casual mentions, or implied requirements.
OUTPUT FORMAT:
{CONSTRAINT_FORMAT}
RULES:
❌ Do not include explanations, comments, markdown, or extra text.
✅ Output only a valid JSON object using proper, correct JSON syntax with single curly braces.
🚫 No markdown code blocks (no ```).
⚠️ Only include valid keys listed below. Use `null` where no constraint is mentioned or implied.
PARSING STRATEGY:
🔍 SCAN for biological keywords and casual mentions:
- Expression systems: "E.coli", "yeast", "mammalian", "bacterial", "expressible in X"
- Yield indicators: "high", "low", "boost", "maximize", "poor yield"
- Solubility clues: "soluble", "aggregation", "misfolding", "inclusion bodies"
- Temperature hints: specific temps (16C), "cold", "low temp", "room temperature"
- Optimization cues: "optimize codons", "codon usage", "expression optimization"
🧠 INFERENCE RULES:
- Any expression system mention → also set codon_optimization to same value
- Aggregation/misfolding concerns → avoid_aggregation: true
- Temperature specifications → extract numeric value
- Yield descriptors → map to "high"/"medium"/"low"
- Solubility percentages → convert to decimal (80% → 0.8)
✅ SUPPORTED CONSTRAINTS:
{supported_constraints}
🧪 MINIMAL EXAMPLES:
"expressible in E.coli" → {{"expression_system": "E.coli", "codon_optimization": "E.coli", "avoid_aggregation": null, "solubility_score_min": null, "yield_level": null, "expression_temperature": null}}
"prevent aggregation" → {{"expression_system": null, "avoid_aggregation": true, "solubility_score_min": null, "yield_level": null, "codon_optimization": null, "expression_temperature": null}}
"80% soluble" → {{"expression_system": null, "avoid_aggregation": null, "solubility_score_min": 0.8, "yield_level": null, "codon_optimization": null, "expression_temperature": null}}
Now extract from:
"""
# System prompt for the workflow planner.
# Fix: the example output previously opened two braces ("{{" twice) but only
# closed one inside the array, rendering as "[ { { ..." — an unbalanced,
# malformed example. Removed the duplicate so the example matches the
# single-object shape used by PROMPT_FOR_PLANNER_V2's example.
PROMPT_FOR_PLANNER=f"""
ROLE:
You are a biological AI workflow planner.
You help convert high-level experimental goals into step-by-step computational workflows that can be executed in a virtual biology lab.
INPUT:
A user's biological intent or problem description, in natural language.
GOAL:
Respond with a list of ordered workflow steps, where each step is a JSON object with:
"operation": a task from the supported operations list (see below)
"biological_inputs": required fields
"depends": the operation on which the current operation depends on
Format your output strictly (required) as:
{BCL_TASK_FORMAT_FOR_EXP_V2}
RULES:
❌ Do not include explanations, comments, markdown, or extra text.
✅ Output only a valid JSON array using proper , correct JSON syntax, use single curly braces.
🚫 No markdown code blocks (no ```).
⚠️ Only include operations listed in the SUPPORTED OPERATIONS section.
⚠️ If the user's input cannot be mapped to any of the supported operations, respond exactly as:
{{
"decision": "reject"
}}
✅ SUPPORTED OPERATIONS:
{supported_experiments}
🧪 EXAMPLE INPUT PROMPT (User)
"Design a nanobody that targets the HER2 S310F mutant."
✅ EXAMPLE OUTPUT (Planner Response)
[
{{
"operation":"introduce_point_mutation",
"biological_inputs": {{
"wildtype_sequence": "",
"mutation": "S310F"
}},
"depends": "retrieve_uniprot_sequence"
}}
]
"""
#Use prior step outputs as inputs where needed.
# Incremental-planning variant of the planner prompt: tells the model to skip
# steps already present in EXECUTED OPERATIONS.
# NOTE(review): as an f-string this interpolates EXECUTED_WORKFLOW once at
# module import time (i.e. None); reassignments made later in BCL_PLANNER are
# never reflected here — confirm whether this prompt should instead be rebuilt
# per call. Also unused by the visible code (BCL_PLANNER uses
# PROMPT_FOR_PLANNER).
PROMPT_FOR_PLANNER_V2=f"""
ROLE:
You are a biological AI workflow planner.
You help convert high-level experimental goals into step-by-step computational workflows that can be executed in a virtual biology lab.
INPUT:
A user's biological intent or problem description, in natural language.
GOAL:
Respond with a list of ordered workflow steps, where each step is a JSON object with:
"operation": a task from the supported operations list (see below)
"biological_inputs": required fields
"depends": the operation on which the current operation depends on
EXECUTED OPERATIONS:
{EXECUTED_WORKFLOW}
INSTRUCTION:
🔁 Before generating the workflow, check the EXECUTED OPERATIONS.
✅ Do not include any step in your response if it is already present in EXECUTED OPERATIONS with all required biological inputs.
✅ Generate the minimal necessary workflow to accomplish the user’s intent, continuing from the most recent executed step.
Format your output strictly (required) as:
{BCL_TASK_FORMAT_FOR_EXP_V2}
RULES:
❌ Do not include explanations, comments, markdown, or extra text.
✅ Output only a valid JSON array using proper, correct JSON syntax, use single curly braces.
🚫 No markdown code blocks (no ```).
⚠️ Only include operations listed in the SUPPORTED OPERATIONS section.
⚠️ If the user's input cannot be mapped to any of the supported operations, respond exactly as:
{{
"decision": "reject"
}}
✅ SUPPORTED OPERATIONS:
{supported_experiments}
🧪 EXAMPLE INPUT PROMPT (User)
"Design a nanobody that targets the HER2 S310F mutant."
✅ EXAMPLE OUTPUT (Planner Response)
[
{{
"operation":"introduce_point_mutation",
"biological_inputs": {{
"wildtype_sequence": "",
"mutation": "S310F"
}},
"depends": "retrieve_uniprot_sequence"
}}
]
"""
class xFORCE_BIOLOGICAL_CONTEXT_LANGUAGE():
    """Biological Context Language (BCL) front-end.

    Converts a natural-language biology request into a structured workflow
    schema by prompting the Groq LLM, persisting conversation history and
    executed-operation memory through the module-level DatabaseEngine ``de``.
    """

    def __init__(self):
        pass

    def _BCL_CONSTRAINTS(self, userinput):
        """Extract manufacturability constraints from ``userinput``.

        Returns the raw model response text (expected to be a JSON object
        string; it is not parsed or validated here).
        """
        messages = [
            {"role": "system", "content": PROMPT_FOR_CONSTRAINTS_V2},
            {"role": "user", "content": userinput},
        ]
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            stream=False,
            max_completion_tokens=5000,
        )
        return response.choices[0].message.content

    def BCL_PLANNER(self, userinput, id):
        """Plan a workflow for ``userinput`` within conversation ``id``.

        Returns either the planner's raw reject response (a JSON string
        containing a "decision" key) or a dict with "experiments",
        "constraints_mode" and "constraints" keys.

        Raises json.JSONDecodeError if the model reply is not valid JSON.
        NOTE: ``id`` shadows the builtin but is kept unchanged for caller
        compatibility.
        """
        global EXECUTED_WORKFLOW
        # Seed or reload the executed-operations memory for this session.
        ops_status = de.CheckEmptyOps(id)
        if ops_status == True:
            de.InsertMemory({
                "bcl_id": id,
                "executed_operations": EXECUTED_WORKFLOW,
                "executed_operations_results": None,
            })
        elif ops_status == False:
            executed_ops = de.FetchMemory(id)
            EXECUTED_WORKFLOW = executed_ops.get("executed_operations")

        status = de.CheckEmpty(id)
        actual_preserved_message = {"role": "system", "content": PROMPT_FOR_PLANNER}
        g_messages = [actual_preserved_message]
        if status == True:
            # First turn of this conversation: persist it and use it directly.
            de.Insert_Conversation({
                "bcl_id": id,
                "messages": [
                    {"role": "user", "content": userinput}
                ],
            })
            g_messages.append({"role": "user", "content": userinput})
        elif status == False:
            # Append the new turn, then rebuild LLM context from stored history.
            de.Update_Conversation(id, [{"role": "user", "content": userinput}])
            history = de.FetchConversation(id)
            for message in history.get("messages"):
                g_messages.append(message)
            if len(g_messages) > 8:
                # Bound prompt size: keep only the 4 most recent turns,
                # re-anchored on the system prompt.
                g_messages = g_messages[-4:]
                g_messages.insert(0, actual_preserved_message)

        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=g_messages,
            stream=False,
            max_completion_tokens=5000,
        )
        response_message = response.choices[0].message.content

        # Persist the assistant reply so follow-up turns see it.
        de.Update_Conversation(id, [{"role": "assistant", "content": response_message}])

        # Parse once (previous revision parsed the same string up to 3 times).
        parsed = json.loads(response_message)
        if isinstance(parsed, dict) and "decision" in parsed:
            # Planner rejected the request; pass the raw reply through.
            return response_message
        time.sleep(5)  # crude pause between the two Groq calls (rate limiting)
        constraints = self._BCL_CONSTRAINTS(userinput)
        print(constraints)
        BCL_SCHEMA = {
            "experiments": parsed,
            "constraints_mode": "",
            # NOTE(review): stored as the raw model string, not parsed JSON —
            # confirm downstream consumers expect a string here.
            "constraints": constraints,
        }
        return BCL_SCHEMA