Spaces:
Runtime error
Runtime error
| """ | |
| Main pipeline for Paper2ProjectPage. | |
| Integrates all modules to generate project pages from research papers. | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import time | |
| from dotenv import load_dotenv | |
| from pathlib import Path | |
| import shutil | |
| from ProjectPageAgent.parse_paper import parse_paper_for_project_page, save_parsed_content | |
| from ProjectPageAgent.html_finder import HtmlFinder | |
| from ProjectPageAgent.content_planner import ProjectPageContentPlanner | |
| from ProjectPageAgent.html_generator import ProjectPageHTMLGenerator,to_url | |
| from utils.wei_utils import get_agent_config | |
| from ProjectPageAgent.content_planner import filter_references | |
| from utils.src.utils import run_sync_screenshots | |
| load_dotenv() | |
| def matching(requirement): | |
| weight = { | |
| "background_color": 1.0, | |
| "has_hero_section": 0.75, | |
| "Page density": 0.85, | |
| "image_layout": 0.65, | |
| "title_color": 0.6, | |
| "has_navigation": 0.7 | |
| } | |
| with open('tags.json', 'r') as f: | |
| template_tags = json.load(f) | |
| points = {} | |
| for name, tag in template_tags.items(): | |
| for feature, value in tag.items(): | |
| if requirement[feature] == value: | |
| if name not in points.keys(): | |
| points[name] = weight[feature] | |
| else: | |
| points[name] += weight[feature] | |
| sorted_points = sorted(points.items(), key=lambda x: x[1], reverse=True) | |
| return [template[0] for template in sorted_points[0:3]] | |
| def copy_static_files(template_file_path, template_root_dir, output_dir, paper_name): | |
| print(f"Detecting Static files: {template_file_path}") | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Create output directory for this specific project | |
| project_output_dir = f"{output_dir}/{paper_name}" | |
| os.makedirs(project_output_dir, exist_ok=True) | |
| # template_dir = os.path.dirname(template_file_path) | |
| static_dir = os.path.join(project_output_dir, 'static') | |
| os.makedirs(static_dir, exist_ok=True) | |
| html_relative_path = os.path.relpath(template_file_path, template_root_dir) | |
| # template_static_dir = os.path.join(template_dir, 'static') | |
| if os.path.exists(template_root_dir) and os.path.isdir(template_root_dir): | |
| print(f"Found template dir: {template_root_dir}") | |
| try: | |
| shutil.copytree(template_root_dir, project_output_dir, dirs_exist_ok=True) | |
| os.remove(os.path.join(project_output_dir, html_relative_path)) | |
| print(f"Copied template to: {project_output_dir}") | |
| except Exception as e: | |
| print(f"Failed to copy static files: {e}") | |
| try: | |
| with open(template_file_path, 'r', encoding='utf-8') as f: | |
| html_content = f.read() | |
| except Exception as e: | |
| print(f"Failed to read template file: {e}") | |
| return | |
| return static_dir | |
| def main(): | |
| """Main pipeline for generating project pages from research papers.""" | |
| parser = argparse.ArgumentParser(description='Paper2ProjectPage Generation Pipeline') | |
| parser.add_argument('--paper_path', type=str, required=True, help='Path to the research paper PDF') | |
| parser.add_argument('--model_name_t', type=str, default='4o', help='Text model name') | |
| parser.add_argument('--model_name_v', type=str, default='4o', help='Vision model name') | |
| parser.add_argument('--template_root', type=str, default="project_templates", help='Directory containing all templates') | |
| parser.add_argument('--template_dir', type=str, help='Directory of chosen template') | |
| parser.add_argument('--template_file', type=str, help='Path to a specific template file to use') | |
| parser.add_argument('--output_dir', type=str, default='generated_project_pages', help='Output directory for generated pages') | |
| parser.add_argument('--style_preference', type=str, default=None, help='Path to style preference JSON file') | |
| parser.add_argument('--tmp_dir', type=str, default='tmp', help='Temporary directory') | |
| parser.add_argument('--full_content_check_times', type=int, default='0', help='Temporary directory') | |
| parser.add_argument('--background_color', type=str, choices=['light', 'dark'], required=True, | |
| help='Background color of generated project page') | |
| parser.add_argument('--has_navigation', type=str, choices=['yes', 'no'], required=True, | |
| help='Is the generated project page has navigation') | |
| parser.add_argument('--has_hero_section', type=str, choices=['yes', 'no'], required=True, | |
| help='Is the generated project page has hero section') | |
| parser.add_argument('--title_color', type=str, choices=['pure', 'colorful'], required=True, | |
| help="Is the title's color of the project page is pure or colorful") | |
| parser.add_argument('--page_density', type=str, choices=['spacious', 'compact'], required=True, | |
| help="The overall spacing tightnessβamount of white space vs. information density") | |
| parser.add_argument('--image_layout', type=str, choices=['rotation', 'parallelism'], required=True, | |
| help="The dominant arrangement style for images.") | |
| parser.add_argument('--html_check_times', type=int, default='1', help='Temporary directory') | |
| parser.add_argument( | |
| '--resume', | |
| type=str, | |
| choices=['parse_pdf', 'generate_content','full_content_check', 'generate_html', 'html_check','modify_table','html_feedback'], | |
| default='parse_pdf', | |
| help="From which step to resume: 'parse_pdf', 'generate_content','full_content_check', 'generate_html', 'html_check','modify_table','html_feedback'", | |
| ) | |
| parser.add_argument('--human_input', type=str, default='1',choices=['0','1'] ,help='Human input for feedback') | |
| args = parser.parse_args() | |
| if not args.template_dir: | |
| template_requirement = { | |
| "background_color": args.background_color, | |
| "has_hero_section": args.has_hero_section, | |
| "Page density": args.page_density, | |
| "image_layout": args.image_layout, | |
| "has_navigation": args.has_navigation, | |
| "title_color": args.title_color | |
| } | |
| matched_template = matching(template_requirement) | |
| print('Below is names of the most matching 3 templates:') | |
| print(' '.join(matched_template)) | |
| template_name = input('Please choose one from them, you can just input the name of your favorite template') | |
| while template_name not in matched_template: | |
| template_name = input('Please input the correct name of your favorite template!!') | |
| args.template_dir = os.path.join(args.template_root, template_name) | |
| # Extract html path from root path | |
| if not args.template_file: | |
| html_finder_ = HtmlFinder() | |
| args.template_file = html_finder_.find_html(args.template_dir) | |
| # Extract paper name from path | |
| paper_name = args.paper_path.split('/')[-1].replace('.pdf', '') if '/' in args.paper_path else args.paper_path.replace('.pdf', '') | |
| args.paper_name = paper_name | |
| print(f"Starting Paper2ProjectPage generation for: {paper_name}") | |
| print(f"Paper path: {args.paper_path}") | |
| print(f"Models: {args.model_name_t} (text), {args.model_name_v} (vision)") | |
| start_time = time.time() | |
| total_input_tokens_t = 0 | |
| total_output_tokens_t = 0 | |
| total_input_tokens_v = 0 | |
| total_output_tokens_v = 0 | |
| # Create temporary directory | |
| os.makedirs(args.tmp_dir, exist_ok=True) | |
| try: | |
| # Get agent configurations | |
| agent_config_t = get_agent_config(args.model_name_t) | |
| agent_config_v = get_agent_config(args.model_name_v) | |
| # Step 1: Parse the research paper | |
| print("\n" + "="*50) | |
| print("STEP 1: Parsing Research Paper") | |
| print("="*50) | |
| raw_content_path = f'project_contents/{args.paper_name}_raw_content.json' | |
| if not os.path.exists(raw_content_path): | |
| print(f"Raw content does not exist at {raw_content_path}") | |
| input_token, output_token, raw_result, images, tables = parse_paper_for_project_page(args, agent_config_t) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| # Save parsed content | |
| raw_content_path, token_log_path = save_parsed_content(args, raw_result, images, tables, input_token, output_token) | |
| # Load parsed content | |
| with open(raw_content_path, 'r') as f: | |
| paper_content = json.load(f) | |
| else: | |
| print(f"Loading existing raw content from {raw_content_path}") | |
| with open(raw_content_path, 'r') as f: | |
| paper_content = json.load(f) | |
| # Load images and tables from the saved content | |
| images = paper_content.get('images', []) | |
| tables = paper_content.get('tables', []) | |
| token_log_path = raw_content_path.replace('_raw_content.json', '_parse_log.json') | |
| images = paper_content.get('images', []) | |
| tables = paper_content.get('tables', []) | |
| figures = { | |
| 'images': images, | |
| 'tables': tables | |
| } | |
| paper_content = paper_content.get('markdown_content', "") | |
| print("\n" + "="*50) | |
| print("STEP 2: Generate project page content") | |
| print("="*50) | |
| planner = ProjectPageContentPlanner(agent_config_t, args) | |
| figures_path = f'project_contents/{args.paper_name}_generated_filtered_figures.json' | |
| generated_section_path = f'project_contents/{args.paper_name}_generated_section.json' | |
| text_page_content_path = f'project_contents/{args.paper_name}_generated_text_content.json' | |
| generated_content_path = f'project_contents/{args.paper_name}_generated_full_content.json' | |
| if args.resume in ['parse_pdf','generate_content','full_content_check']: | |
| if args.resume != 'full_content_check': | |
| paper_content, figures, input_token, output_token = planner.filter_raw_content(paper_content, figures) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| generated_section, input_token, output_token = planner.section_generation(paper_content, figures) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| text_page_content, input_token, output_token = planner.text_content_generation(paper_content, figures, generated_section) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| else : | |
| print("Skipping content generation: filter_raw_content, section_generation, text_content_generation") | |
| print("Loading existing content from previous steps.") | |
| paper_content = filter_references(paper_content) | |
| with open(figures_path, 'r') as f: | |
| figures = json.load(f) | |
| with open(generated_section_path, 'r') as f: | |
| generated_section = json.load(f) | |
| with open(text_page_content_path, 'r') as f: | |
| text_page_content = json.load(f) | |
| generated_content, input_token, output_token = planner.full_content_generation(args, paper_content, figures, generated_section, text_page_content) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| print("\n" + "="*50) | |
| print("STEP 2.5: Copying Static Files") | |
| print("="*50) | |
| static_dir = copy_static_files(args.template_file, args.template_dir, args.output_dir, args.paper_name) | |
| else: | |
| print("Page content is already generated, loading existing content.") | |
| paper_content = filter_references(paper_content) | |
| with open(generated_section_path, 'r') as f: | |
| generated_section = json.load(f) | |
| with open(text_page_content_path, 'r') as f: | |
| text_page_content = json.load(f) | |
| with open(generated_content_path, 'r') as f: | |
| generated_content = json.load(f) | |
| static_dir = copy_static_files(args.template_file, args.template_dir, args.output_dir, args.paper_name) | |
| # static_dir = os.path.join(args.output_dir, args.paper_name, 'static') | |
| # Step 3: Generate HTML project page | |
| print("\n" + "="*50) | |
| print("STEP 3: Generating HTML Project Page") | |
| print("="*50) | |
| html_relative_path = os.path.relpath(args.template_file, args.template_dir) | |
| html_dir = '/'.join(html_relative_path.strip().split('/')[:-1]) | |
| html_generator = ProjectPageHTMLGenerator(agent_config_t,args) | |
| with open(args.template_file, 'r', encoding='utf-8') as file: | |
| html_template = file.read() | |
| # Generate HTML | |
| if args.resume != 'modify_table' and args.resume != 'html_feedback': | |
| # Create assets directory and copy images | |
| assets_dir = html_generator.create_assets_directory(args, html_dir, args.output_dir) | |
| # Generate complete HTML | |
| html_content, input_token, output_token = html_generator.generate_complete_html( | |
| args, generated_content, html_dir, html_template | |
| ) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| # Save HTML file | |
| html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_no_modify_table.html') | |
| with open(html_file_path,'w') as file: | |
| file.write(html_content) | |
| run_sync_screenshots(to_url(html_file_path), os.path.join(args.output_dir,args.paper_name, html_dir,'page_final_no_modify_table.png')) | |
| else: | |
| print(f"skip generate_html and html_check, load html from {os.path.join(args.output_dir,args.paper_name, html_dir,'index.html')}") | |
| assets_dir = os.path.join(args.output_dir, args.paper_name, html_dir,'assets') | |
| with open(os.path.join(args.output_dir,args.paper_name, html_dir,'index_no_modify_table.html'),'r') as file: | |
| html_content = file.read() | |
| if args.resume != 'html_feedback': | |
| html_content ,input_token,output_token = html_generator.modify_html_table(html_content,html_dir) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_modify_table.html') | |
| with open(html_file_path,'w') as file: | |
| file.write(html_content) | |
| # html_file_path = html_generator.save_html_file(html_content, args, html_dir,args.output_dir) | |
| else: | |
| print("skipping modify_table,go to html_feedback") | |
| html_file_path = os.path.join(args.output_dir, args.paper_name, html_dir, 'index_modify_table.html') | |
| with open(html_file_path,'r') as file: | |
| html_content = file.read() | |
| print('-'*50) | |
| run_sync_screenshots(to_url(html_file_path), os.path.join(args.output_dir, args.paper_name, html_dir,'page_final.png')) | |
| if args.human_input == '1': | |
| human_feedback = input('Please view the final html in index.html,and image in page_final.png,If there are no problems, enter yes and press Enter.\n If there are any problems, please give me feedback directly.\n') | |
| while human_feedback.lower() != 'yes': | |
| html_content ,input_token,output_token = html_generator.modify_html_from_human_feedback(html_content,human_feedback) | |
| total_input_tokens_t += input_token | |
| total_output_tokens_t += output_token | |
| with open(os.path.join(args.output_dir, args.paper_name, html_dir, 'index.html'),'w') as file: | |
| file.write(html_content) | |
| run_sync_screenshots(to_url(os.path.join(args.output_dir, args.paper_name, html_dir, 'index.html')), os.path.join(args.output_dir, args.paper_name, html_dir,'page_final.png')) | |
| print('-'*50) | |
| human_feedback = input('Please view the final html in index.html,and image in page_final.png,If there are no problems, enter yes and press Enter. \n If there are any problems, please give me feedback directly.\n') | |
| html_file_path = html_generator.save_html_file(html_content, args, html_dir,args.output_dir) | |
| # Generate and save metadata | |
| metadata = html_generator.generate_metadata(generated_content, args) | |
| metadata_path = html_generator.save_metadata(metadata, args, args.output_dir) | |
| # Step 4: Finalize and save logs | |
| print("\n" + "="*50) | |
| print("STEP 4: Finalizing Generation") | |
| print("="*50) | |
| end_time = time.time() | |
| time_taken = end_time - start_time | |
| # Save generation log | |
| log_data = { | |
| 'paper_name': paper_name, | |
| 'paper_path': args.paper_path, | |
| 'models': { | |
| 'text_model': args.model_name_t, | |
| 'vision_model': args.model_name_v | |
| }, | |
| 'token_usage': { | |
| 'text_input_tokens': total_input_tokens_t, | |
| 'text_output_tokens': total_output_tokens_t, | |
| 'vision_input_tokens': total_input_tokens_v, | |
| 'vision_output_tokens': total_output_tokens_v | |
| }, | |
| 'generation_time': time_taken, | |
| 'output_files': { | |
| 'html_file': html_file_path, | |
| 'assets_dir': assets_dir, | |
| 'static_dir': static_dir, | |
| 'metadata_file': metadata_path | |
| }, | |
| 'content_files': { | |
| 'raw_content': raw_content_path, | |
| 'token_log': token_log_path | |
| } | |
| } | |
| log_path = f"{args.output_dir}/{args.paper_name}/generation_log.json" | |
| with open(log_path, 'w') as f: | |
| json.dump(log_data, f, indent=4) | |
| print(f"\nβ Paper2ProjectPage generation completed successfully!") | |
| print(f"π Output directory: {args.output_dir}/{args.paper_name}") | |
| print(f"π HTML file: {html_file_path}") | |
| print(f"π Assets directory: {assets_dir}") | |
| print(f"π¨ Static directory: {static_dir}") | |
| print(f"π Metadata file: {metadata_path}") | |
| print(f"β±οΈ Total time: {time_taken:.2f} seconds") | |
| print(f"π’ Token usage - Text: {total_input_tokens_t}β{total_output_tokens_t}, Vision: {total_input_tokens_v}β{total_output_tokens_v}") | |
| except Exception as e: | |
| print(f"\nβ Error during generation: {str(e)}") | |
| raise | |
| if __name__ == '__main__': | |
| main() |