# introlix_api/main.py
# Initial commit (79d285f) by satyam998
import subprocess
def run_app(command=None, working_directory=None):
    """Run a Scrapy spider as a subprocess and report its output.

    Args:
        command (list[str] | None): argv list to execute. Defaults to the
            generic spider crawl (``["scrapy", "crawl", "generic"]``).
        working_directory (str | None): directory to run the command in.
            Defaults to the spider project directory.

    Returns:
        subprocess.CompletedProcess: the finished process, so callers can
        inspect ``returncode``, ``stdout`` and ``stderr``.
    """
    if command is None:
        command = ["scrapy", "crawl", "generic"]
    if working_directory is None:
        working_directory = "src/introlix_api/app/introlix_spider"

    # List argv + shell=False (the default) avoids shell-injection issues.
    result = subprocess.run(
        command, cwd=working_directory, capture_output=True, text=True
    )
    print("Output:", result.stdout)
    print("Error:", result.stderr)
    # Surface a non-zero exit status explicitly instead of failing silently.
    if result.returncode != 0:
        print(f"Command exited with non-zero status {result.returncode}")
    return result
if __name__ == "__main__":
    # Entry point: launch the Scrapy spider once and print its output.
    run_app()
# def run_get_urls_from_page_parallel(self, urls: list, max_workers: int=10) -> list:
# """
# Running get_urls_from_page function in parallel for many runs.
# Args:
# urls (list): list of urls
# max_workers (int, optional): number of workers. Defaults to 10.
# Returns:
# list: list of fetched urls
# """
# fetched_urls = []
# with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
# futures = {executor.submit(self.get_urls_from_page, url): url for url in urls}
# for future in concurrent.futures.as_completed(futures):
# url = futures[future]
# try:
# result = future.result()
# fetched_urls.append(result)
# except Exception as e:
# raise CustomException(e, sys) from e
# return list(set(list(url for sublist in fetched_urls if sublist is not None for url in sublist)))