Spaces:
Runtime error
Runtime error
| """ | |
| Paper parsing module for ProjectPageAgent. | |
| Reuses the parsing capabilities from Paper2Poster. | |
| """ | |
| from ProjectPageAgent.parse_raw import parse_raw, gen_image_and_table | |
| from utils.wei_utils import get_agent_config | |
| import json | |
| import os | |
| import argparse | |
def parse_paper_for_project_page(args, agent_config_t, version=2):
    """
    Run the paper-parsing step and collect its outputs for page generation.

    Args:
        args: Command line arguments. Must expose ``paper_path`` and
            ``paper_name`` whenever ``poster_path`` / ``poster_name`` are
            absent, because the missing poster attributes are filled in
            from them (``args`` is mutated in place).
        agent_config_t: Text model configuration, forwarded to ``parse_raw``.
        version: Parser version, forwarded to ``parse_raw``.

    Returns:
        tuple: (input_tokens, output_tokens, raw_result, images, tables)
    """
    print("Step 1: Parsing the research paper...")

    # parse_raw comes from the poster pipeline; mirror the paper_* attributes
    # onto the poster_* names for compatibility, without overwriting values
    # the caller already supplied.
    attribute_aliases = (
        ('poster_path', 'paper_path'),
        ('poster_name', 'paper_name'),
    )
    for poster_attr, paper_attr in attribute_aliases:
        if not hasattr(args, poster_attr):
            setattr(args, poster_attr, getattr(args, paper_attr))

    # Parse the raw paper content.
    parsed = parse_raw(args, agent_config_t, version=version)
    input_token, output_token, raw_result = parsed

    # Extract images and tables; the first two returned values are not used here.
    _first, _second, images, tables = gen_image_and_table(args, raw_result)

    print(f"Parsing completed. Tokens: {input_token} -> {output_token}")
    print(f"Extracted {len(images)} images and {len(tables)} tables")

    return input_token, output_token, raw_result, images, tables
def save_parsed_content(args, raw_result, images, tables, input_token, output_token):
    """
    Write the parsed paper content and a parsing log to ``project_contents/``.

    Two JSON files are produced, both named after ``args.paper_name``:
    ``<paper_name>_raw_content.json`` and ``<paper_name>_parse_log.json``.

    Args:
        args: Command line arguments; only ``paper_name`` is read here.
        raw_result: Parsed raw content. If it has a ``document`` attribute,
            its ``export_to_markdown()`` output is stored together with
            ``images`` and ``tables``; otherwise ``raw_result`` itself is
            dumped as JSON and ``images`` / ``tables`` are not included in
            the content file.
        images: Extracted images (only their count goes into the log).
        tables: Extracted tables (only their count goes into the log).
        input_token: Input token count
        output_token: Output token count

    Returns:
        tuple: (raw_content_path, token_log_path)
    """
    def _write_json(destination, payload):
        # Both output files share the same 4-space-indented JSON layout.
        with open(destination, 'w') as handle:
            json.dump(payload, handle, indent=4)

    os.makedirs('project_contents', exist_ok=True)
    raw_content_path = f'project_contents/{args.paper_name}_raw_content.json'

    # Results without a `document` attribute are assumed to be
    # JSON-serializable already and are written unchanged.
    if not hasattr(raw_result, 'document'):
        content_payload = raw_result
    else:
        content_payload = {
            'markdown_content': raw_result.document.export_to_markdown(),
            'images': images,
            'tables': tables,
        }
    _write_json(raw_content_path, content_payload)

    # Record token usage alongside the number of extracted assets.
    usage_summary = {
        'parse_input_tokens': input_token,
        'parse_output_tokens': output_token,
        'total_images': len(images),
        'total_tables': len(tables),
    }
    token_log_path = f'project_contents/{args.paper_name}_parse_log.json'
    _write_json(token_log_path, usage_summary)

    print(f"Parsed content saved to {raw_content_path}")
    return raw_content_path, token_log_path