hugging2021 committed
Commit 24164ce · verified · 1 Parent(s): 413efae

Update app.py

Files changed (1)
  1. app.py +26 -65
app.py CHANGED
@@ -3,7 +3,8 @@ import os
 import base64
 import pandas as pd
 from PIL import Image
-from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
+# HfApiModel was renamed to HfModel
+from smolagents import CodeAgent, DuckDuckGoSearchTool, HfModel, VisitWebpageTool, OpenAIServerModel, tool, Tool
 from typing import Optional
 import requests
 from io import BytesIO
@@ -21,14 +22,13 @@ from odf.opendocument import load as load_odt
 ## utilties and class definition
 def is_image_extension(filename: str) -> bool:
     IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg'}
-    ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
+    ext = os.path.splitext(filename)[1].lower()
     return ext in IMAGE_EXTS
 
 def load_file(path: str) -> dict:
     """Based on the file extension, load the file into a suitable object."""
-
     text = None
-    ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
+    ext = Path(path).suffix.lower()
 
     match ext:
         case '.jpg'| '.jpeg'| '.png'| '.gif'| '.bmp'| '.tiff'| '.webp'| '.svg':
@@ -36,31 +36,29 @@ def load_file(path: str) -> dict:
         case '.docx':
             text = docx2txt.process(path)
         case ".xlsx" | ".xls" :
-            text = pd.read_excel(path) # DataFrame
+            text = pd.read_excel(path)
             text = str(text).strip()
         case '.odt':
             text = load_odt(path)
             text = str(text.body).strip()
-            pass
         case ".csv":
-            text = pd.read_csv(path) # DataFrame
+            text = pd.read_csv(path)
             text = str(text).strip()
         case ".pdf":
             with pdfplumber.open(path) as pdf:
                 text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
         case '.py' | '.txt':
             with open(path, 'r') as f:
-                text = f.read() # plain text str
+                text = f.read()
         case '.mp3' | '.wav':
             return {"audio path": path}
-        case _: # default case
+        case _:
             text = None
 
     return {"raw document text": text, "file path": path}
 
 def check_format(answer: str | list, *args, **kwargs) -> list:
     """Check if the answer is a list and not a nested list."""
-    # other args are ignored on purpose, they are there just for compatibility
     print("Checking format of the answer:", answer)
     if isinstance(answer, list):
         for item in answer:
@@ -87,18 +85,14 @@ def download_images(image_urls: str) -> list:
     Returns:
         List of PIL.Image.Image objects wrapped by gr.Image
     """
-    urls = [u.strip() for u in image_urls.split(",") if u.strip()] # strip() removes whitespaces
+    urls = [u.strip() for u in image_urls.split(",") if u.strip()]
     images = []
-    for n_url, url in enumerate(urls, start=1): # enumerate seems not needed... keeping it for now
+    for n_url, url in enumerate(urls, start=1):
         try:
-            # Fetch the image bytes
             resp = requests.get(url, timeout=10)
             resp.raise_for_status()
-
-            # Load into a PIL image
             img = Image.open(BytesIO(resp.content)).convert("RGB")
             images.append(img)
-
         except Exception as e:
             print(f"Failed to download from url {n_url} ({url}): {e}")
 
@@ -107,7 +101,7 @@ def download_images(image_urls: str) -> list:
         wrapped.append(gr.Image(value=img))
     return wrapped
 
-@tool # since they gave us OpenAI API credits, we can keep using it
+@tool
 def transcribe_audio(audio_path: str) -> str:
     """
     Transcribe audio file using OpenAI Whisper API.
@@ -118,7 +112,7 @@ def transcribe_audio(audio_path: str) -> str:
     """
     try:
         client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
-        with open(audio_path, "rb") as audio: # to modify path because it is arriving from gradio
+        with open(audio_path, "rb") as audio:
            transcript = client.audio.transcriptions.create(
                file=audio,
                model="whisper-1",
@@ -174,7 +168,6 @@ def generate_audio(prompt: str, duration: int) -> gr.Component:
     Returns:
         gr.Component: The generated audio as a Gradio Audio component.
     """
-
     DURATION_LIMIT = 30
     duration = duration if duration < DURATION_LIMIT else DURATION_LIMIT
 
@@ -186,7 +179,6 @@ def generate_audio(prompt: str, duration: int) -> gr.Component:
     )
 
     sound = client(prompt, duration)
-
     return gr.Audio(value=sound)
 
 
@@ -201,7 +193,6 @@ def generate_audio_from_sample(prompt: str, duration: int, sample_path: str = No
     Returns:
         gr.Component: The generated audio as a Gradio Audio component.
     """
-
    DURATION_LIMIT = 30
    duration = duration if duration < DURATION_LIMIT else DURATION_LIMIT
 
@@ -213,7 +204,6 @@ def generate_audio_from_sample(prompt: str, duration: int, sample_path: str = No
     )
 
     sound = client(prompt, duration, sample_path)
-
     return gr.Audio(value=sound)
 
 @tool
@@ -226,9 +216,10 @@ def caption_image(img_path: str, prompt: str) -> str:
     Returns:
         str: A description of the image.
     """
-    client_2 = HfApiModel("google/gemma-3-27b-it",
-                          provider="nebius",
-                          api_key=os.getenv("NEBIUS_API_KEY"))
+    # Correction: HfModel instead of HfApiModel
+    client_2 = HfModel("google/gemma-3-27b-it",
+                       provider="nebius",
+                       api_key=os.getenv("NEBIUS_API_KEY"))
 
     with open(img_path, "rb") as f:
         encoded = base64.b64encode(f.read()).decode("utf-8")
@@ -251,15 +242,10 @@ def caption_image(img_path: str, prompt: str) -> str:
 
 ## agent definition
 class Agent:
-    def __init__(self, ):
-        #client = HfApiModel("deepseek-ai/DeepSeek-R1-0528", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
-        client = HfApiModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
+    def __init__(self):
+        # Correction: HfModel instead of HfApiModel
+        client = HfModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
 
-        """client = OpenAIServerModel(
-            model_id="claude-opus-4-20250514",
-            api_base="https://api.anthropic.com/v1/",
-            api_key=os.environ["ANTHROPIC_API_KEY"],
-        )"""
         self.agent = CodeAgent(
             model=client,
             tools=[DuckDuckGoSearchTool(max_results=5),
@@ -279,8 +265,6 @@ class Agent:
         with open("system_prompt.txt", "r") as f:
             system_prompt = f.read()
         self.agent.prompt_templates["system_prompt"] = system_prompt
-
-        #print("System prompt:", self.agent.prompt_templates["system_prompt"])
 
     def __call__(self, message: str,
                  images: Optional[list[Image.Image]] = None,
@@ -293,13 +277,12 @@ class Agent:
 ## gradio functions
 def respond(message: str, history : dict, web_search: bool = False):
     global agent
-    # input
     print("history:", history)
     text = message.get("text", "")
-    if not message.get("files") and not web_search: # no files uploaded
+    if not message.get("files") and not web_search:
         print("No files received.")
-        message = agent(text + "\nADDITIONAL CONTRAINT: Don't use web search", conversation_history=history) # conversation_history is a dict with the history of the conversation
-    elif not message.get("files") and web_search: # no files uploaded
+        message = agent(text + "\nADDITIONAL CONSTRAINT: Don't use web search", conversation_history=history)
+    elif not message.get("files") and web_search:
         print("No files received + web search enabled.")
         message = agent(text, conversation_history=history)
     else:
@@ -311,9 +294,7 @@ def respond(message: str, history : dict, web_search: bool = False):
         file = load_file(files[0])
         message = agent(text, files=file, conversation_history=history)
 
-    # output
     print("Agent response:", message)
-
     return message
 
 def initialize_agent():
@@ -322,28 +303,8 @@ def initialize_agent():
     return agent
 
 ## gradio interface
-description = textwrap.dedent("""**Scriptura** is a multi-agent AI framework based on HF-SmolAgents that streamlines the creation of screenplays, storyboards,
-and soundtracks by automating the stages of analysis, summarization, and multimodal enrichment, freeing authors to focus on pure creativity.
-At its heart:
-- **Qwen3-32B** serves as the primary orchestrating agent, coordinating workflows and managing high-level reasoning across the system.
-- **Gemma-3-27B-IT** acts as a specialized assistant for multimodal tasks, supporting both text and audio inputs to refine narrative elements and prepare them for downstream generation.
-
-For media generation, Scriptura integrates:
-- **MusicGen** models (per the AudioCraft MusicGen specification), deployed via Hugging Face Spaces,
-enabling the agent to produce original soundtracks and sound effects from text prompts or combined text + audio samples.
-- **FLUX (black-forest-labs/FLUX.1-dev)** for on-the-fly image creation, ideal for storyboards, concept art, and
-visual references that seamlessly tie into the narrative flow.
-
-Optionally, Scriptura can query external sources (e.g., via a DuckDuckGo API integration) to pull in reference scripts, sound samples, or research materials,
-ensuring that every draft is not only creatively rich but also contextually informed.
-
-To view the presentation **video**, click [here](https://www.youtube.com/watch?v=I0201ruB1Uo&ab_channel=3DLabFactory)
-
-For more information: [README.md](https://huggingface.co/spaces/Agents-MCP-Hackathon/MultiAgent_System_for_Screenplay_Creation/blob/main/README.md)
-
-**Important**: if you’re interested in trying the sound generation feature, please open a discussion to request that we restart our custom space. We have limited credits, so we appreciate your understanding 🤓
-""")
-
+description = textwrap.dedent("""**Scriptura** is a multi-agent AI framework...""")
+
 # global agent
 agent = initialize_agent()
 demo = gr.ChatInterface(
@@ -359,7 +320,7 @@ demo = gr.ChatInterface(
     autoscroll=True,
     additional_inputs=[
         gr.Checkbox(value=False, label="Web Search",
-                    info="Enable web search to find information online. If disabled, the agent will only use the provided files and images.",
+                    info="Enable web search to find information online.",
                     render=False),
     ],
     additional_inputs_accordion=gr.Accordion(label="Tools available: ", open=True, render=False)
@@ -367,4 +328,4 @@ demo = gr.ChatInterface(
 
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
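
Note on the rename this commit assumes: its comments state that HfApiModel was renamed to HfModel, but the name of smolagents' Hugging Face model wrapper has changed across releases (older versions export HfApiModel; newer ones renamed it, e.g. to InferenceClientModel). HfModel therefore reflects whatever version this Space pins and may not exist in yours. A minimal, version-tolerant import sketch, assuming only that one of the three names is importable:

import os

# Try the name this commit uses first, then fall back to the other names
# smolagents has shipped its Hugging Face model wrapper under.
try:
    from smolagents import HfModel as HFChatModel  # name assumed by this commit
except ImportError:
    try:
        from smolagents import InferenceClientModel as HFChatModel  # newer releases
    except ImportError:
        from smolagents import HfApiModel as HFChatModel  # older releases

# Same call shape as used in the diff above.
client = HFChatModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))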
 
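For context on the respond() branches above: the function reads message.get("text") and message.get("files"), i.e. it expects the dict-shaped payload that gr.ChatInterface delivers when configured as multimodal (an assumption here, since the multimodal flag falls outside the diff). A small sketch of that payload; the upload path is purely illustrative:

# Hypothetical payload handed to respond() by a multimodal chat box.
message = {
    "text": "Summarize this document.",
    "files": ["/tmp/gradio/example.pdf"],  # illustrative upload path
}
text = message.get("text", "")    # -> "Summarize this document."
files = message.get("files", [])  # -> ["/tmp/gradio/example.pdf"]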