Maga222006 committed on
Commit
5cf48c0
·
1 Parent(s): c3adf17

MultiagentPersonalAssistant

Browse files
.idea/MultiagentPersonalAssistant.iml CHANGED
@@ -2,7 +2,7 @@
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$" />
5
- <orderEntry type="jdk" jdkName="MultiagentPersonalAssistant" jdkType="Python SDK" />
6
  <orderEntry type="sourceFolder" forTests="false" />
7
  </component>
8
  <component name="PyDocumentationSettings">
 
2
  <module type="PYTHON_MODULE" version="4">
3
  <component name="NewModuleRootManager">
4
  <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="test" jdkType="Python SDK" />
6
  <orderEntry type="sourceFolder" forTests="false" />
7
  </component>
8
  <component name="PyDocumentationSettings">
.idea/misc.xml CHANGED
@@ -3,5 +3,5 @@
3
  <component name="Black">
4
  <option name="sdkName" value="MultiagentPersonalAssistant" />
5
  </component>
6
- <component name="ProjectRootManager" version="2" project-jdk-name="MultiagentPersonalAssistant" project-jdk-type="Python SDK" />
7
  </project>
 
3
  <component name="Black">
4
  <option name="sdkName" value="MultiagentPersonalAssistant" />
5
  </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="test" project-jdk-type="Python SDK" />
7
  </project>
agent/__pycache__/file_preprocessing.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/file_preprocessing.cpython-312.pyc and b/agent/__pycache__/file_preprocessing.cpython-312.pyc differ
 
agent/__pycache__/models.cpython-312.pyc CHANGED
Binary files a/agent/__pycache__/models.cpython-312.pyc and b/agent/__pycache__/models.cpython-312.pyc differ
 
agent/file_preprocessing.py CHANGED
@@ -1,36 +1,25 @@
1
- from transformers import AutoTokenizer, AutoModelForCausalLM
2
  from speechbrain.inference.classifiers import EncoderClassifier
 
 
3
  import speech_recognition as sr
4
  from pydub import AudioSegment
5
  from dotenv import load_dotenv
6
  from PyPDF2 import PdfReader
7
  from docx import Document
8
- from PIL import Image
9
  import torchaudio
10
  import mimetypes
11
  import asyncio
12
- import torch
13
- import io
14
  import os
15
 
16
  load_dotenv()
17
- MID = "apple/FastVLM-1.5B"
18
- IMAGE_TOKEN_INDEX = -200
19
-
20
- tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
21
- model = AutoModelForCausalLM.from_pretrained(
22
- MID,
23
- dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
24
- device_map="auto",
25
- trust_remote_code=True,
26
- )
27
  language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
28
 
29
 
30
  async def preprocess_file(file_name: str):
31
  mime_type = mimetypes.guess_type(file_name)[0]
32
  if "image" in mime_type:
33
- return await asyncio.to_thread(preprocess_image, file_name)
34
  elif "video" in mime_type:
35
  prompt = "Give a detailed description of the video."
36
  elif "audio" in mime_type:
@@ -79,40 +68,21 @@ def preprocess_audio(file_name: str):
79
  return text
80
 
81
 
82
- def preprocess_image(file_name: str) -> str:
83
- """Send an image + instruction to FastVLM and return the model's answer."""
84
-
85
- # Build chat with placeholder <image>
86
- messages = [{"role": "user", "content": f"<image>\nDescribe this image in detail."}]
87
- rendered = tok.apply_chat_template(
88
- messages, add_generation_prompt=True, tokenize=False
 
 
 
 
 
 
89
  )
90
- pre, post = rendered.split("<image>", 1)
91
-
92
- # Tokenize text around the image placeholder
93
- pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
94
- post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
95
-
96
- # Insert the image token id (-200)
97
- img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
98
- input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
99
- attention_mask = torch.ones_like(input_ids, device=model.device)
100
-
101
- # Preprocess the image
102
- img = Image.open(file_name).convert("RGB")
103
- px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
104
- px = px.to(model.device, dtype=model.dtype)
105
-
106
- # Generate response
107
- with torch.no_grad():
108
- out = model.generate(
109
- inputs=input_ids,
110
- attention_mask=attention_mask,
111
- images=px,
112
- max_new_tokens=128,
113
- )
114
-
115
- return tok.decode(out[0], skip_special_tokens=True)
116
 
117
 
118
  def preprocess_text(file_name, mime_type: str) -> str:
 
 
1
  from speechbrain.inference.classifiers import EncoderClassifier
2
+ from langchain_core.messages import HumanMessage
3
+ from agent.models import llm_image
4
  import speech_recognition as sr
5
  from pydub import AudioSegment
6
  from dotenv import load_dotenv
7
  from PyPDF2 import PdfReader
8
  from docx import Document
 
9
  import torchaudio
10
  import mimetypes
11
  import asyncio
12
+ import base64
 
13
  import os
14
 
15
  load_dotenv()
 
 
 
 
 
 
 
 
 
 
16
  language_id = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp")
17
 
18
 
19
  async def preprocess_file(file_name: str):
20
  mime_type = mimetypes.guess_type(file_name)[0]
21
  if "image" in mime_type:
22
+ return await preprocess_image(file_name)
23
  elif "video" in mime_type:
24
  prompt = "Give a detailed description of the video."
25
  elif "audio" in mime_type:
 
68
  return text
69
 
70
 
71
+ async def preprocess_image(file_name: str):
72
+ with open(file_name, "rb") as f:
73
+ img_b64 = base64.b64encode(f.read()).decode("utf-8")
74
+ response = await llm_image.ainvoke([HumanMessage(
75
+ content=[
76
+ {"type": "text", "text": "Please analyze this image and give detailed description."},
77
+ {
78
+ "type": "image_url",
79
+ "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}
80
+ },
81
+ ]
82
+ )
83
+ ]
84
  )
85
+ return response.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  def preprocess_text(file_name, mime_type: str) -> str:
agent/models.py CHANGED
@@ -13,4 +13,8 @@ llm_peripheral = init_chat_model(
13
 
14
  llm_agents = init_chat_model(
15
  model="groq:qwen/qwen3-32b"
 
 
 
 
16
  )
 
13
 
14
  llm_agents = init_chat_model(
15
  model="groq:qwen/qwen3-32b"
16
+ )
17
+
18
+ llm_image = init_chat_model(
19
+ model="groq:meta-llama/llama-4-scout-17b-16e-instruct"
20
  )
app.py CHANGED
@@ -29,14 +29,17 @@ async def file_mode(
29
  data = await file.read()
30
  dest.write_bytes(data)
31
 
 
 
 
 
32
  # Preprocess file
33
  transcription = await preprocess_file(str(dest))
34
  state_data["message"] = HumanMessage(
35
  content=transcription
36
  )
37
  # Call the agents
38
- assistant = Assistant(state=state_data)
39
- await assistant.authorization()
40
  response = await assistant.run()
41
  os.remove(str(dest))
42
  return response
@@ -57,6 +60,10 @@ async def file_mode(
57
  data = await file.read()
58
  dest.write_bytes(data)
59
 
 
 
 
 
60
  # Preprocess file
61
  file_contents = await preprocess_file(str(dest))
62
  state_data["message"] = HumanMessage(
@@ -64,8 +71,7 @@ async def file_mode(
64
  )
65
 
66
  # Call the agents
67
- assistant = Assistant(state=state_data)
68
- await assistant.authorization()
69
  response = await assistant.run()
70
  os.remove(str(dest))
71
  return response
@@ -86,6 +92,10 @@ async def file_mode(
86
  data = await file.read()
87
  dest.write_bytes(data)
88
 
 
 
 
 
89
  # Preprocess file
90
  file_contents = await preprocess_file(str(dest))
91
  state_data["message"] = HumanMessage(
@@ -93,8 +103,7 @@ async def file_mode(
93
  )
94
 
95
  # Call the agents
96
- assistant = Assistant(state=state_data)
97
- await assistant.authorization()
98
  response = await assistant.run()
99
  os.remove(str(dest))
100
  return response
 
29
  data = await file.read()
30
  dest.write_bytes(data)
31
 
32
+ # Initialize agent
33
+ assistant = Assistant(state=state_data)
34
+ await assistant.authorization()
35
+
36
  # Preprocess file
37
  transcription = await preprocess_file(str(dest))
38
  state_data["message"] = HumanMessage(
39
  content=transcription
40
  )
41
  # Call the agents
42
+ assistant.state = state_data
 
43
  response = await assistant.run()
44
  os.remove(str(dest))
45
  return response
 
60
  data = await file.read()
61
  dest.write_bytes(data)
62
 
63
+ # Initialize agent
64
+ assistant = Assistant(state=state_data)
65
+ await assistant.authorization()
66
+
67
  # Preprocess file
68
  file_contents = await preprocess_file(str(dest))
69
  state_data["message"] = HumanMessage(
 
71
  )
72
 
73
  # Call the agents
74
+ assistant.state = state_data
 
75
  response = await assistant.run()
76
  os.remove(str(dest))
77
  return response
 
92
  data = await file.read()
93
  dest.write_bytes(data)
94
 
95
+ #Initialize agent
96
+ assistant = Assistant(state=state_data)
97
+ await assistant.authorization()
98
+
99
  # Preprocess file
100
  file_contents = await preprocess_file(str(dest))
101
  state_data["message"] = HumanMessage(
 
103
  )
104
 
105
  # Call the agents
106
+ assistant.state = state_data
 
107
  response = await assistant.run()
108
  os.remove(str(dest))
109
  return response