Spaces:
Configuration error
Configuration error
Upload 20 files
Browse files- ValidationAgent/.DS_Store +0 -0
- ValidationAgent/ValidationAgent.py +126 -0
- ValidationAgent/__init__.py +1 -0
- ValidationAgent/__pycache__/ValidationAgent.cpython-311.pyc +0 -0
- ValidationAgent/__pycache__/ValidationAgent.cpython-313.pyc +0 -0
- ValidationAgent/__pycache__/__init__.cpython-311.pyc +0 -0
- ValidationAgent/__pycache__/__init__.cpython-313.pyc +0 -0
- ValidationAgent/instructions.md +12 -0
- ValidationAgent/tools/DataValidationTool.py +54 -0
- ValidationAgent/tools/FirecrawlDataScraperTool.py +40 -0
- ValidationAgent/tools/GapIdentificationTool.py +44 -0
- ValidationAgent/tools/ReportReviewTool.py +47 -0
- ValidationAgent/tools/SearchAndScrape.py +76 -0
- ValidationAgent/tools/__init__.py +1 -0
- ValidationAgent/tools/__pycache__/DataValidationTool.cpython-311.pyc +0 -0
- ValidationAgent/tools/__pycache__/FirecrawlDataScraperTool.cpython-311.pyc +0 -0
- ValidationAgent/tools/__pycache__/GapIdentificationTool.cpython-311.pyc +0 -0
- ValidationAgent/tools/__pycache__/ReportReviewTool.cpython-311.pyc +0 -0
- ValidationAgent/tools/__pycache__/SearchAndScrape.cpython-311.pyc +0 -0
- ValidationAgent/tools/__pycache__/__init__.cpython-311.pyc +0 -0
ValidationAgent/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
ValidationAgent/ValidationAgent.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.agents import Agent
|
| 2 |
+
from agency_swarm.tools import CodeInterpreter
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
from .tools.SearchAndScrape import SearchAndScrape
|
| 6 |
+
|
| 7 |
+
class ValidationAgent(Agent):
    """Agent that reviews market research reports for completeness and
    quality, scraping additional data to fill any gaps it finds.

    Model access goes through ``self._get_model()``, which is not defined in
    this module -- presumably supplied by the framework or a subclass
    (TODO confirm).
    """

    def __init__(self):
        super().__init__(
            name="ValidationAgent",
            description="This agent validates market research reports using AI and ensures data completeness.",
            instructions="./instructions.md",
            files_folder="./files",
            schemas_folder="./schemas",
            tools=[SearchAndScrape],
            tools_folder="./tools",
            temperature=0.3,
            model="groq/llama-3.3-70b-versatile",
            max_prompt_tokens=25000,
        )

    def validate_data(self, report_data, _depth=0):
        """Validate the report using AI and fill gaps if needed.

        Args:
            report_data: The report text to validate.
            _depth: Internal recursion counter; after gap-filling the report
                is re-validated at most 3 times so a persistently-incomplete
                report cannot recurse without bound.

        Returns:
            The model's validation result (expected to be JSON text), or a
            dict with an ``"error"`` key on failure.
        """
        # NOTE: the literal JSON schema below uses doubled braces ({{ }}) so
        # the f-string does not evaluate it as a Python expression.  With
        # single braces the schema was parsed as a dict literal and raised
        # NameError (undefined name `boolean`) every time this method ran.
        validation_prompt = f"""
        Analyze this market research report for quality and completeness:

        {report_data}

        Please check for:
        1. Missing key information
        2. Data accuracy and consistency
        3. Logical flow and structure
        4. Completeness of sections:
           - Market Size & Growth
           - Competitive Landscape
           - Consumer Analysis
           - Technology & Innovation
           - Future Outlook

        Provide a detailed assessment with:
        1. Quality score (0-100)
        2. List of missing or incomplete sections
        3. Specific recommendations for improvement
        4. Additional data points needed

        Format: JSON with these keys:
        {{
            "quality_score": int,
            "missing_sections": list,
            "recommendations": list,
            "additional_data_needed": list,
            "is_complete": boolean
        }}
        """

        try:
            model = self._get_model()  # Updated method to get model
            response = model.generate_content(validation_prompt)
            validation_result = response.text

            # If validation shows missing data, scrape for it and validate
            # again (bounded by _depth; see docstring).
            if '"is_complete": false' in validation_result.lower() and _depth < 3:
                missing_data = self._fill_missing_data(validation_result)
                if missing_data:
                    # Combine original report with new data, then re-validate.
                    updated_report = self._merge_reports(report_data, missing_data)
                    return self.validate_data(updated_report, _depth + 1)

            return validation_result

        except Exception as e:
            logging.error(f"Validation error: {str(e)}")
            return {"error": str(e)}

    def _fill_missing_data(self, validation_result):
        """Scrape data for each section the validation marked as missing.

        Returns:
            A list of ``{"section", "content"}`` dicts, or ``None`` when
            nothing could be gathered or the validation result was not
            parseable JSON.
        """
        try:
            # Extract missing sections from validation result.
            # NOTE(review): assumes the model returned bare JSON; a response
            # wrapped in markdown fences will fail json.loads and land in the
            # except branch below -- confirm the model's output format.
            import json
            result = json.loads(validation_result)
            missing_sections = result.get("missing_sections", [])

            additional_data = []
            for section in missing_sections:
                # Create specific search query for missing section
                search_query = f"{section} market research data analysis"
                tool = SearchAndScrape(query=search_query)
                section_data = tool.run()
                if section_data:
                    additional_data.append({
                        "section": section,
                        "content": section_data
                    })

            return additional_data if additional_data else None

        except Exception as e:
            logging.error(f"Error filling missing data: {str(e)}")
            return None

    def _merge_reports(self, original_report, new_data):
        """Merge original report with newly scraped data.

        Falls back to returning the original report unchanged on any error.
        """
        merge_prompt = f"""
        Merge this original report with new data:

        Original Report:
        {original_report}

        New Data to Add:
        {new_data}

        Please create a cohesive, well-structured report that incorporates all information without duplication.
        Ensure proper flow and transitions between sections.
        """

        try:
            model = self._get_model()  # Updated method to get model
            response = model.generate_content(merge_prompt)
            return response.text
        except Exception as e:
            logging.error(f"Error merging reports: {str(e)}")
            return original_report

    def response_validator(self, message):
        """Framework hook for validating outgoing responses; currently a
        pass-through."""
        return message
|
ValidationAgent/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .ValidationAgent import ValidationAgent
|
ValidationAgent/__pycache__/ValidationAgent.cpython-311.pyc
ADDED
|
Binary file (5.86 kB). View file
|
|
|
ValidationAgent/__pycache__/ValidationAgent.cpython-313.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
ValidationAgent/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (272 Bytes). View file
|
|
|
ValidationAgent/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (249 Bytes). View file
|
|
|
ValidationAgent/instructions.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ValidationAgent Instructions
|
| 2 |
+
|
| 3 |
+
You are an agent responsible for reviewing draft reports for completeness, accuracy, and quality. Your role is to identify gaps and inconsistencies in the data and use Firecrawl to scrape additional data to fill these gaps or improve existing sections.
|
| 4 |
+
|
| 5 |
+
### Primary Instructions:
|
| 6 |
+
1. Review the draft report provided by the WebScraperAgent for completeness and accuracy.
|
| 7 |
+
2. Identify any gaps or inconsistencies in the data presented in the draft report.
|
| 8 |
+
3. Use Firecrawl to scrape additional data to fill identified gaps or improve existing sections of the report.
|
| 9 |
+
4. Validate the newly gathered data for relevance and accuracy.
|
| 10 |
+
5. Compile the validated data into a final report format.
|
| 11 |
+
6. Communicate with the WebScraperAgent to address any persistent issues or discrepancies in the data.
|
| 12 |
+
7. Ensure the final report is accurate, complete, and ready for submission to the MarketInsightsCEO.
|
ValidationAgent/tools/DataValidationTool.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.tools import BaseTool
|
| 2 |
+
from pydantic import Field
|
| 3 |
+
import re
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
class DataValidationTool(BaseTool):
    """
    Validates raw report data and, when it passes, wraps it into a final
    report string.

    Two checks are performed:
      * numerical consistency -- every standalone number must occur exactly
        as many times as ``expected_repetitions`` specifies (once, when no
        expectation is given or the expectation is 0);
      * date presence -- at least one YYYY-MM-DD date must appear.
    """

    raw_data: str = Field(
        ..., description="The raw data to be validated and compiled into the final report."
    )

    expected_repetitions: dict = Field(
        default_factory=dict,
        description="A dictionary specifying numbers that are expected to repeat and their expected counts."
    )

    def run(self):
        """
        Validates the accuracy of the data and compiles it into a cohesive final report.
        """
        # Tally every standalone integer token in the text.
        # NOTE(review): this also counts digits inside dates (e.g. "2024"),
        # which can trip the consistency check -- confirm that is intended.
        occurrences = defaultdict(int)
        for token in re.findall(r'\b\d+\b', self.raw_data):
            occurrences[token] += 1

        # A number with no explicit expectation (or an expectation of 0) is
        # assumed to appear exactly once; `or 1` covers both cases.
        conflicting = [
            f"{token} (found {seen}, expected {self.expected_repetitions.get(token) or 1})"
            for token, seen in occurrences.items()
            if seen != (self.expected_repetitions.get(token) or 1)
        ]
        if conflicting:
            return f"Data validation failed: Conflicting numerical data found for numbers: {', '.join(conflicting)}."

        # The report must contain at least one ISO-style (YYYY-MM-DD) date.
        if not re.findall(r'\b\d{4}-\d{2}-\d{2}\b', self.raw_data):
            return "Data validation failed: No valid date formats found."

        # Everything checked out: emit the compiled report.
        return f"Final Report:\n\n{self.raw_data}"
|
ValidationAgent/tools/FirecrawlDataScraperTool.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.tools import BaseTool
|
| 2 |
+
from pydantic import Field
|
| 3 |
+
import requests
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Assuming Firecrawl API requires an API key
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
firecrawl_base_url = "https://api.firecrawl.com/data"

class FirecrawlDataScraperTool(BaseTool):
    """
    This tool interfaces with Firecrawl to gather additional data based on
    identified gaps or areas needing improvement in the report.

    Returns a human-readable string that either contains the gathered data
    or describes why the request failed.
    """

    gap_description: str = Field(
        ..., description="Description of the identified gap or area needing improvement in the report."
    )

    def run(self):
        """
        Interfaces with Firecrawl to gather additional data based on the provided gap description.
        """
        # Prepare the request to Firecrawl API
        headers = {
            "Authorization": f"Bearer {firecrawl_api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "query": self.gap_description
        }

        # Send the request to Firecrawl API.  A timeout is set so a stalled
        # connection cannot hang the agent indefinitely (the original call
        # had none), and network-level failures are reported as strings,
        # consistent with the non-200 branch below.
        try:
            response = requests.post(
                firecrawl_base_url, json=payload, headers=headers, timeout=30
            )
        except requests.RequestException as exc:
            return f"Failed to gather data from Firecrawl. Request error: {exc}"

        # Check if the request was successful
        if response.status_code == 200:
            data = response.json()
            return f"Data gathered from Firecrawl: {data}"
        else:
            return f"Failed to gather data from Firecrawl. Status code: {response.status_code}, Error: {response.text}"
|
ValidationAgent/tools/GapIdentificationTool.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.tools import BaseTool
|
| 2 |
+
from pydantic import Field
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
class GapIdentificationTool(BaseTool):
    """
    This tool analyzes the report to find any logical gaps or inconsistencies
    in the data or narrative.

    Heuristics used:
      * sentences containing contrast words ("however", "but", ...) are
        surfaced as potential logical gaps;
      * any repeated number in the text is flagged as a potential
        inconsistency.
    """

    report_content: str = Field(
        ..., description="The content of the report to be analyzed for logical gaps or inconsistencies."
    )

    def run(self):
        """
        Analyzes the report content to identify logical gaps or inconsistencies in the data or narrative.
        """
        # Define patterns or keywords that might indicate logical gaps or inconsistencies
        gap_indicators = [
            "however", "but", "although", "nevertheless", "in contrast", "on the other hand"
        ]

        # Find sentences with potential logical gaps.  Matching is done on a
        # lower-cased copy: the original compared case-sensitively, so a
        # sentence beginning "However, ..." (the most common position for
        # these words) was never detected.
        potential_gaps = []
        sentences = re.split(r'(?<=[.!?]) +', self.report_content)
        for sentence in sentences:
            lowered = sentence.lower()
            if any(indicator in lowered for indicator in gap_indicators):
                potential_gaps.append(sentence)

        # Check for inconsistencies in data (e.g., conflicting numbers).
        # NOTE(review): any number appearing twice is flagged, including
        # legitimately repeated figures -- confirm this is the intended bar.
        numbers = re.findall(r'\b\d+\b', self.report_content)
        inconsistencies = []
        if len(set(numbers)) != len(numbers):
            inconsistencies.append("Conflicting numerical data found.")

        # Compile the analysis results
        analysis_results = {
            "potential_gaps": potential_gaps,
            "inconsistencies": inconsistencies
        }

        # Return the analysis results as a string
        return f"Analysis Results: {analysis_results}"
|
ValidationAgent/tools/ReportReviewTool.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.tools import BaseTool
|
| 2 |
+
from pydantic import Field
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
class ReportReviewTool(BaseTool):
    """
    This tool analyzes the content of draft reports, checking for completeness,
    accuracy, and quality.  It identifies any missing sections or errors in
    the report, returning the findings as a formatted string.
    """

    report_content: str = Field(
        ..., description="The content of the draft report to be analyzed."
    )

    def run(self):
        """
        Analyzes the report content for completeness, accuracy, and quality.
        Identifies missing sections or errors in the report.
        """
        # Define the expected sections in a report
        expected_sections = [
            "Introduction", "Methodology", "Results", "Discussion", "Conclusion"
        ]

        # Check for missing sections (simple substring containment).
        missing_sections = [
            section for section in expected_sections if section not in self.report_content
        ]

        # Check for common errors: immediately repeated words ("the the").
        # IGNORECASE is added so sentence-initial duplicates ("The the") are
        # caught too; the original case-sensitive scan missed them.
        errors = re.findall(r'\b(\w+)\s+\1\b', self.report_content, flags=re.IGNORECASE)

        # Check for quality (e.g., length of the report)
        quality_issues = []
        if len(self.report_content.split()) < 500:
            quality_issues.append("The report is too short, consider adding more content.")

        # Compile the analysis results
        analysis_results = {
            "missing_sections": missing_sections,
            "errors": errors,
            "quality_issues": quality_issues
        }

        # Return the analysis results as a string
        return f"Analysis Results: {analysis_results}"
|
ValidationAgent/tools/SearchAndScrape.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agency_swarm.tools import BaseTool
|
| 2 |
+
from pydantic import Field
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
try:
|
| 8 |
+
from firecrawl import FirecrawlApp
|
| 9 |
+
except ImportError:
|
| 10 |
+
raise ImportError(
|
| 11 |
+
"Required packages not found. Please install them using:\n"
|
| 12 |
+
"pip install firecrawl"
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# Initialize Firecrawl.
# SECURITY: the API key was previously hard-coded below.  Prefer the
# FIRECRAWL_API_KEY environment variable; the old literal remains only as a
# backward-compatible fallback and should be rotated and removed.
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", "fc-5fadfeae30314d4ea8a3d9afaa75c493")
firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

class SearchAndScrape(BaseTool):
    """
    This tool scrapes content using Firecrawl based on a provided query.
    """

    query: str = Field(
        ...,
        description="The search query to look for",
        examples=["market trends in technology 2024", "AI industry statistics"]
    )

    def scrape_with_retry(self, url, max_retries=3):
        """Helper function to scrape URL with retry logic.

        Returns the scraped markdown when it exceeds 200 characters, or
        ``None`` when the URL is skipped, yields too little content, or
        every attempt fails.
        """
        # Domains (and PDFs) this pipeline routinely fails on -- skip early.
        problematic_domains = [
            'sparktoro.com', 'j-jdis.com', 'linkedin.com',
            'facebook.com', 'twitter.com', 'reddit.com', '.pdf'
        ]

        if any(domain in url.lower() for domain in problematic_domains):
            logging.info(f"Skipping problematic URL: {url}")
            return None

        for attempt in range(max_retries):
            try:
                response = firecrawl_app.scrape_url(
                    url=url,
                    params={'formats': ['markdown']}
                )

                # Require a minimum amount of content so boilerplate-only
                # pages are not treated as a successful scrape.
                if response and response.get('markdown'):
                    content = response.get('markdown')
                    if len(content.strip()) > 200:
                        return content
                return None
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # brief back-off before the next attempt
                    continue
        return None

    def run(self):
        """Scrape content for the configured query and return the first
        successfully scraped page, or an error message."""
        logging.info(f"Scraping content for query: {self.query}")
        # Here you would typically have a list of URLs to scrape based on the query.
        # For this example, we will assume a predefined list of URLs.
        # TODO(review): wire in a real search backend -- with these
        # placeholders the tool never returns data relevant to the query.
        search_results = ["http://example.com/article1", "http://example.com/article2"]  # Placeholder URLs

        if not search_results:
            return "No search results found."

        for url in search_results:
            logging.info(f"Attempting to scrape URL: {url}")
            content = self.scrape_with_retry(url)
            if content:
                logging.info(f"Successfully scraped content from {url}")
                return f"Content from {url}:\n\n{content}"

        return "Failed to scrape content from any of the search results"
|
ValidationAgent/tools/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .SearchAndScrape import SearchAndScrape
|
ValidationAgent/tools/__pycache__/DataValidationTool.cpython-311.pyc
ADDED
|
Binary file (2.72 kB). View file
|
|
|
ValidationAgent/tools/__pycache__/FirecrawlDataScraperTool.cpython-311.pyc
ADDED
|
Binary file (2.09 kB). View file
|
|
|
ValidationAgent/tools/__pycache__/GapIdentificationTool.cpython-311.pyc
ADDED
|
Binary file (2.38 kB). View file
|
|
|
ValidationAgent/tools/__pycache__/ReportReviewTool.cpython-311.pyc
ADDED
|
Binary file (2.23 kB). View file
|
|
|
ValidationAgent/tools/__pycache__/SearchAndScrape.cpython-311.pyc
ADDED
|
Binary file (4.09 kB). View file
|
|
|
ValidationAgent/tools/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (278 Bytes). View file
|
|
|