inank committed on
Commit
fd4e7f3
·
verified ·
1 Parent(s): 6d3d769

feat: add new tools & enable file upload

Browse files

Added PDF extractor and a text analyze-and-summarize tool.
Enabled file upload by providing a temp folder to the app launcher.

Files changed (1) hide show
  1. tools/pdf_extractor.py +105 -20
tools/pdf_extractor.py CHANGED
@@ -1,30 +1,115 @@
1
- from smolagents import tool
2
- import PyPDF2
 
 
 
 
 
3
 
 
4
 
5
@tool
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extracts all text content from a PDF file.

    Args:
        pdf_path: The file path to the PDF file to extract text from (e.g., '/tmp/document.pdf')

    Returns:
        The extracted text content from the PDF file
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            # One labelled chunk per page: "--- Page N ---\n<page text>".
            page_chunks = [
                f"--- Page {index} ---\n{page.extract_text()}"
                for index, page in enumerate(reader.pages, start=1)
            ]
        return "\n\n".join(page_chunks)
    except FileNotFoundError:
        return f"Error: PDF file not found at path: {pdf_path}"
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
+ import datetime
3
+ import requests
4
+ import pytz
5
+ import yaml
6
+ from tools.final_answer import FinalAnswerTool
7
+ from tools.pdf_extractor import extract_text_from_pdf
8
 
9
+ from Gradio_UI import GradioUI
10
 
11
  @tool
12
def summarize_and_analyze_text(text: str, max_sentences: int = 5) -> str:
    """Analyzes and summarizes text content, extracting key information and main ideas.

    This tool intelligently condenses lengthy text into concise summaries while preserving
    the most important information. Perfect for processing search results, PDFs, and documents.

    Args:
        text: The text content to summarize and analyze
        max_sentences: Maximum number of sentences in the summary (default: 5)

    Returns:
        A formatted summary containing key points and main ideas from the text
    """
    # Hoisted from mid-function: imports belong at the top of the scope.
    import re
    import string

    def _normalize(word: str) -> str:
        # Strip surrounding punctuation so "data." and "data" count as one word;
        # previously punctuation skewed frequencies, scores, and key terms.
        return word.lower().strip(string.punctuation)

    try:
        # Remove extra whitespace and normalize text
        text = " ".join(text.split())

        if len(text) < 100:
            return f"Text is too short to summarize. Original text:\n{text}"

        # Guard against non-positive limits so the summary is never empty.
        max_sentences = max(1, max_sentences)

        # Split into sentences (simple punctuation-based approach).
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]

        # Score words by frequency, ignoring short (<= 3 char) filler words.
        word_freq = {}
        for raw in text.split():
            word = _normalize(raw)
            if len(word) > 3:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Score each sentence as the sum of its words' frequencies.
        sentence_scores = []
        for i, sent in enumerate(sentences):
            score = sum(word_freq.get(_normalize(w), 0) for w in sent.split())
            sentence_scores.append((i, score, sent))

        # Pick the highest-scoring sentences (stable sort keeps original order
        # on ties) but emit them in document order. Set gives O(1) membership.
        top_indices = {
            idx for idx, _, _ in sorted(sentence_scores, key=lambda x: -x[1])[:max_sentences]
        }
        summary_sentences = [sent for idx, _, sent in sentence_scores if idx in top_indices]

        summary = " ".join(summary_sentences)

        # Key terms = the most frequent normalized words (ties keep first-seen order).
        sorted_words = sorted(word_freq.items(), key=lambda x: -x[1])
        key_terms = ", ".join([word for word, _ in sorted_words[:5]])

        return f"""📋 SUMMARY:\n{summary}\n\n🔑 KEY TERMS: {key_terms}\n\n📊 ANALYSIS:\n- Text length: {len(text)} characters\n- Total sentences: {len(sentences)}\n- Summary length: {len(summary_sentences)} sentences"""
    except Exception as e:
        return f"Error analyzing text: {str(e)}"
66
+
67
+ @tool
68
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        # stdlib zoneinfo (Python 3.9+) replaces third-party pytz; an unknown
        # zone raises ZoneInfoNotFoundError, caught by the handler below just
        # as pytz.UnknownTimeZoneError was.
        from zoneinfo import ZoneInfo

        tz = ZoneInfo(timezone)
        # Get current time in that timezone
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
81
+
82
+
83
# ---------------------------------------------------------------------------
# Module-level agent wiring: build the model, assemble the tools, and launch
# the Gradio app. Note this runs on import — there is no __main__ guard.
# ---------------------------------------------------------------------------

final_answer = FinalAnswerTool()

# If the agent does not answer, the model is overloaded, please use another model or the following Hugging Face Endpoint that also contains qwen2.5 coder:
# model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud'

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # it is possible that this model may be overloaded
    custom_role_conversions=None,
)


# Import tool from Hub.
# NOTE(review): trust_remote_code=True executes code downloaded from the Hub;
# acceptable only because the tool repo is the known agents-course example.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Load the agent's prompt templates from the sibling prompts.yaml file.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

agent = CodeAgent(
    model=model,
    tools=[image_generation_tool,get_current_time_in_timezone,extract_text_from_pdf,summarize_and_analyze_text,final_answer], ## add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)


# "/tmp" enables file upload per this commit's message ("providing a temp
# folder to the app launcher"). NOTE(review): presumably GradioUI's second
# positional arg is the upload folder — confirm against Gradio_UI.GradioUI.
GradioUI(agent, "/tmp").launch()