Maga222006 committed on
Commit
5cf48c0
·
1 Parent(s): c3adf17

MultiagentPersonalAssistant

Browse files
.idea/MultiagentPersonalAssistant.iml CHANGED
@@ -2,7 +2,7 @@
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$" />
5
- <orderEntry type="jdk" jdkName="MultiagentPersonalAssistant" jdkType="Python SDK" />
6
  <orderEntry type="sourceFolder" forTests="false" />
7
  </component>
8
  <component name="PyDocumentationSettings">
 
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="test" jdkType="Python SDK" />
6
  <orderEntry type="sourceFolder" forTests="false" />
7
  </component>
8
  <component name="PyDocumentationSettings">
.idea/misc.xml CHANGED
@@ -3,5 +3,5 @@
3
  <component name="Black">
4
  <option name="sdkName" value="MultiagentPersonalAssistant" />
5
  </component>
6
- <component name="ProjectRootManager" version="2" project-jdk-name="MultiagentPersonalAssistant" project-jdk-type="Python SDK" />
7
  </project>
 
3
  <component name="Black">
4
  <option name="sdkName" value="MultiagentPersonalAssistant" />
5
  </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="test" project-jdk-type="Python SDK" />
7
  </project>
agent/__pycache__/file_preprocessing.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/file_preprocessing.cpython-312.pyc and b/agent/__pycache__/file_preprocessing.cpython-312.pyc differ
 
agent/__pycache__/models.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/models.cpython-312.pyc and b/agent/__pycache__/models.cpython-312.pyc differ
 
agent/file_preprocessing.py CHANGED
@@ -1,36 +1,25 @@
1
- from transformers import AutoTokenizer, AutoModelForCausalLM
2
  from speechbrain.inference.classifiers import EncoderClassifier
 
 
3
  import speech_recognition as sr
4
  from pydub import AudioSegment
5
  from dotenv import load_dotenv
6
  from PyPDF2 import PdfReader
7
  from docx import Document
8
- from PIL import Image
9
  import torchaudio
10
  import mimetypes
11
  import asyncio
12
- import torch
13
- import io
14
  import os
15
 
16
  load_dotenv()
17
- MID = "apple/FastVLM-1.5B"
18
- IMAGE_TOKEN_INDEX = -200
19
-
20
- tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
21
- model = AutoModelForCausalLM.from_pretrained(
22
- MID,
23
- dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
24
- device_map="auto",
25
- trust_remote_code=True,
26
- )
27
  language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
28
 
29
 
30
  async def preprocess_file(file_name: str):
31
  mime_type = mimetypes.guess_type(file_name)[0]
32
  if "image" in mime_type:
33
- return await asyncio.to_thread(preprocess_image, file_name)
34
  elif "video" in mime_type:
35
  prompt = "Give a detailed description of the video."
36
  elif "audio" in mime_type:
@@ -79,40 +68,21 @@ def preprocess_audio(file_name: str):
79
  return text
80
 
81
 
82
- def preprocess_image(file_name: str) -> str:
83
- """Send an image + instruction to FastVLM and return the model's answer."""
84
-
85
- # Build chat with placeholder <image>
86
- messages = [{"role": "user", "content": f"<image>\nDescribe this image in detail."}]
87
- rendered = tok.apply_chat_template(
88
- messages, add_generation_prompt=True, tokenize=False
 
 
 
 
 
 
89
  )
90
- pre, post = rendered.split("<image>", 1)
91
-
92
- # Tokenize text around the image placeholder
93
- pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
94
- post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
95
-
96
- # Insert the image token id (-200)
97
- img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
98
- input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
99
- attention_mask = torch.ones_like(input_ids, device=model.device)
100
-
101
- # Preprocess the image
102
- img = Image.open(file_name).convert("RGB")
103
- px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
104
- px = px.to(model.device, dtype=model.dtype)
105
-
106
- # Generate response
107
- with torch.no_grad():
108
- out = model.generate(
109
- inputs=input_ids,
110
- attention_mask=attention_mask,
111
- images=px,
112
- max_new_tokens=128,
113
- )
114
-
115
- return tok.decode(out[0], skip_special_tokens=True)
116
 
117
 
118
  def preprocess_text(file_name, mime_type: str) -> str:
 
 
1
  from speechbrain.inference.classifiers import EncoderClassifier
2
+ from langchain_core.messages import HumanMessage
3
+ from agent.models import llm_image
4
  import speech_recognition as sr
5
  from pydub import AudioSegment
6
  from dotenv import load_dotenv
7
  from PyPDF2 import PdfReader
8
  from docx import Document
 
9
  import torchaudio
10
  import mimetypes
11
  import asyncio
12
+ import base64
 
13
  import os
14
 
15
  load_dotenv()
 
 
 
 
 
 
 
 
 
 
16
  language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
17
 
18
 
19
  async def preprocess_file(file_name: str):
20
  mime_type = mimetypes.guess_type(file_name)[0]
21
  if "image" in mime_type:
22
+ return await preprocess_image(file_name)
23
  elif "video" in mime_type:
24
  prompt = "Give a detailed description of the video."
25
  elif "audio" in mime_type:
 
68
  return text
69
 
70
 
71
+ async def preprocess_image(file_name: str):
72
+ with open(file_name, "rb") as f:
73
+ img_b64 = base64.b64encode(f.read()).decode("utf-8")
74
+ response = await llm_image.ainvoke([HumanMessage(
75
+ content=[
76
+ {"type": "text", "text": "Please analyze this image and give detailed description."},
77
+ {
78
+ "type": "image_url",
79
+ "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
80
+ },
81
+ ]
82
+ )
83
+ ]
84
  )
85
+ return response.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  def preprocess_text(file_name, mime_type: str) -> str:
agent/models.py CHANGED
@@ -13,4 +13,8 @@ llm_peripheral = init_chat_model(
13
 
14
  llm_agents = init_chat_model(
15
  model="groq:qwen/qwen3-32b"
 
 
 
 
16
  )
 
13
 
14
  llm_agents = init_chat_model(
15
  model="groq:qwen/qwen3-32b"
16
+ )
17
+
18
+ llm_image = init_chat_model(
19
+ model="groq:meta-llama/llama-4-scout-17b-16e-instruct"
20
  )
app.py CHANGED
@@ -29,14 +29,17 @@ async def file_mode(
29
  data = await file.read()
30
  dest.write_bytes(data)
31
 
 
 
 
 
32
  # Preprocess file
33
  transcription = await preprocess_file(str(dest))
34
  state_data["message"] = HumanMessage(
35
  content=transcription
36
  )
37
  # Call the agents
38
- assistant = Assistant(state=state_data)
39
- await assistant.authorization()
40
  response = await assistant.run()
41
  os.remove(str(dest))
42
  return response
@@ -57,6 +60,10 @@ async def file_mode(
57
  data = await file.read()
58
  dest.write_bytes(data)
59
 
 
 
 
 
60
  # Preprocess file
61
  file_contents = await preprocess_file(str(dest))
62
  state_data["message"] = HumanMessage(
@@ -64,8 +71,7 @@ async def file_mode(
64
  )
65
 
66
  # Call the agents
67
- assistant = Assistant(state=state_data)
68
- await assistant.authorization()
69
  response = await assistant.run()
70
  os.remove(str(dest))
71
  return response
@@ -86,6 +92,10 @@ async def file_mode(
86
  data = await file.read()
87
  dest.write_bytes(data)
88
 
 
 
 
 
89
  # Preprocess file
90
  file_contents = await preprocess_file(str(dest))
91
  state_data["message"] = HumanMessage(
@@ -93,8 +103,7 @@ async def file_mode(
93
  )
94
 
95
  # Call the agents
96
- assistant = Assistant(state=state_data)
97
- await assistant.authorization()
98
  response = await assistant.run()
99
  os.remove(str(dest))
100
  return response
 
29
  data = await file.read()
30
  dest.write_bytes(data)
31
 
32
+ # Initialize agent
33
+ assistant = Assistant(state=state_data)
34
+ await assistant.authorization()
35
+
36
  # Preprocess file
37
  transcription = await preprocess_file(str(dest))
38
  state_data["message"] = HumanMessage(
39
  content=transcription
40
  )
41
  # Call the agents
42
+ assistant.state = state_data
 
43
  response = await assistant.run()
44
  os.remove(str(dest))
45
  return response
 
60
  data = await file.read()
61
  dest.write_bytes(data)
62
 
63
+ # Initialize agent
64
+ assistant = Assistant(state=state_data)
65
+ await assistant.authorization()
66
+
67
  # Preprocess file
68
  file_contents = await preprocess_file(str(dest))
69
  state_data["message"] = HumanMessage(
 
71
  )
72
 
73
  # Call the agents
74
+ assistant.state = state_data
 
75
  response = await assistant.run()
76
  os.remove(str(dest))
77
  return response
 
92
  data = await file.read()
93
  dest.write_bytes(data)
94
 
95
+ #Initialize agent
96
+ assistant = Assistant(state=state_data)
97
+ await assistant.authorization()
98
+
99
  # Preprocess file
100
  file_contents = await preprocess_file(str(dest))
101
  state_data["message"] = HumanMessage(
 
103
  )
104
 
105
  # Call the agents
106
+ assistant.state = state_data
 
107
  response = await assistant.run()
108
  os.remove(str(dest))
109
  return response