# NOTE: non-code residue ("Spaces: Sleeping") from a Hugging Face Spaces page
# capture was removed here; it was not part of the source program.
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from fastapi import FastAPI, HTTPException | |
| from neo4j import GraphDatabase, basic_auth | |
| import google.generativeai as genai | |
| import logging # Import logging module | |
# --- Logging Configuration ---
# Configure the root logger once at import time: INFO and above, with
# timestamp, logger name, and level in each record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Console output (stderr by default). Append a
        # logging.FileHandler("app.log") here to also log to a file.
        logging.StreamHandler(),
    ],
)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
# --- Environment Variable Configuration ---
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail loudly (in the logs) when the Neo4j connection settings are incomplete.
# Deliberately no hard exit: the app is allowed to start and requests will
# fail at runtime instead.
if not (NEO4J_URI and NEO4J_USER and NEO4J_PASSWORD):
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
# FastAPI application object — the HTTP entry point for this service.
app = FastAPI(
    title="Arxiv to Neo4j Importer",
    description=(
        "API to fetch research paper data from Arxiv, summarize it with "
        "Gemini, and add it to Neo4j."
    ),
    version="1.0.0",
)
# --- Gemini API Client Initialization ---
# Build the Gemini client eagerly at import time. Any failure is downgraded
# to a warning: the service keeps running and simply skips summary generation.
gemini_model = None
if not GEMINI_API_KEY:
    logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
else:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20")  # Specified model
        logger.info("Gemini API client initialized successfully.")
    except Exception as e:
        logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
| # --- Utility Functions (Adapted from your script) --- | |
def get_content(number: str, node_type: str) -> str:
    """Fetches raw HTML content from Arxiv or other sources.

    Returns the page body as a single-line string (newlines stripped), or an
    empty string for an unknown node type or any request failure.
    """
    source_urls = {
        "Patent": f"https://patents.google.com/patent/{number}/en",
        "ResearchPaper": f"https://arxiv.org/abs/{number}",
    }
    target_url = source_urls.get(node_type)
    if target_url is None:
        logger.warning(f"Unknown node type: {node_type} for number {number}")
        return ""
    try:
        resp = requests.get(target_url, timeout=10)  # bounded wait — a hung server cannot stall us forever
        resp.raise_for_status()  # 4XX/5XX responses surface as exceptions
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error for {node_type} number: {number} at URL {target_url}: {e}")
        return ""
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
        return ""
    else:
        # Replace undecodable bytes rather than raising, and flatten newlines.
        return resp.content.decode('utf-8', errors='replace').replace("\n", "")
def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
    """Extracts title/abstract from an Arxiv abstract page and summarizes it.

    Args:
        rp_number: Arxiv identifier (e.g. "2401.01234").
        node_type: Node label used to select the source URL (expects
            "ResearchPaper"; anything else yields no content).

    Returns:
        dict with keys "document", "arxiv_id", "title", "abstract", "summary".
        Placeholder strings mark anything that could not be fetched, parsed,
        or summarized — callers inspect these prefixes, so do not change them.
    """
    raw_content = get_content(rp_number, node_type)
    rp_data = {
        "document": f"Arxiv {rp_number}",  # ID for the paper
        "arxiv_id": rp_number,
        "title": "Error fetching content or content not found",
        "abstract": "Error fetching content or content not found",
        "summary": "Summary not generated"  # Default summary
    }
    if not raw_content:
        logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
        return rp_data  # Returns default error data
    try:
        soup = BeautifulSoup(raw_content, 'html.parser')
        # --- Title ---
        # Arxiv renders the title as
        # <h1 class="title"><span class="descriptor">Title:</span> actual title</h1>,
        # so the text node right after the descriptor span is the title itself.
        title_tag = soup.find('h1', class_='title')
        if title_tag and title_tag.find('span', class_='descriptor'):
            title_text_candidate = title_tag.find('span', class_='descriptor').next_sibling
            if title_text_candidate and isinstance(title_text_candidate, str):
                rp_data["title"] = title_text_candidate.strip()
            else:
                rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
        elif title_tag:  # Fallback if the span descriptor is not there but h1.title exists
            rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
        # --- Abstract ---
        abstract_tag = soup.find('blockquote', class_='abstract')
        if abstract_tag:
            abstract_text = abstract_tag.get_text(strip=True)
            if abstract_text.lower().startswith('abstract'):
                # Strip the leading "Abstract" label and, if present, its colon.
                prefix_end = abstract_text.lower().find('abstract') + len('abstract')
                if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
                    prefix_end += 1  # Include the colon in removal
                abstract_text = abstract_text[prefix_end:].strip()
            rp_data["abstract"] = abstract_text
        # Distinguish "page fetched but field missing" from "fetch failed".
        if rp_data["title"] == "Error fetching content or content not found" and not title_tag:
            rp_data["title"] = "Title not found on page"
        if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
            rp_data["abstract"] = "Abstract not found on page"
        # --- Summary (Gemini) ---
        # Only attempt a summary when the client exists and the abstract is a
        # real abstract, not one of the placeholder strings above.
        if gemini_model and rp_data["abstract"] and \
           not rp_data["abstract"].startswith("Error fetching content") and \
           not rp_data["abstract"].startswith("Abstract not found"):
            # English prompt for Gemini.
            # BUGFIX: the closing tag was previously "<document>"; it must be
            # "</document>" so the delimiters are balanced in the prompt.
            prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
Focus on challenges, gaps, or novel aspects.
Here is the document: <document>{rp_data['abstract']}</document>"""
            try:
                response = gemini_model.generate_content(prompt)
                rp_data["summary"] = response.text
                logger.info(f"Summary generated for Arxiv ID: {rp_number}")
            except Exception as e:
                logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
                rp_data["summary"] = "Error generating summary (API failure)"
        elif not gemini_model:
            rp_data["summary"] = "Summary not generated (Gemini API client not available)"
        else:
            rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
    except Exception as e:
        # Parsing must never crash the endpoint; return whatever was gathered.
        logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
    return rp_data
def add_nodes_to_neo4j(driver, data_list: list, node_label: str) -> int:
    """Adds/updates a batch of nodes in Neo4j in a single write transaction.

    Args:
        driver: An open neo4j Driver instance.
        data_list: Property dicts; each must contain an "arxiv_id" key, which
            is the MERGE key.
        node_label: Label interpolated into the Cypher query — must come from
            trusted code, never from user input (labels cannot be parameters).

    Returns:
        Number of nodes actually created (0 when every node already existed
        and was only updated).

    Raises:
        HTTPException: 500 wrapper around any Neo4j driver error.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0
    query = (
        f"UNWIND $data as properties "
        f"MERGE (n:{node_label} {{arxiv_id: properties.arxiv_id}}) "  # Use MERGE for idempotency
        f"ON CREATE SET n = properties "
        f"ON MATCH SET n += properties"  # Update properties if the node already exists
    )
    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            # execute_write returns whatever the callback returns; .consume()
            # yields the ResultSummary of the completed query.
            summary = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
        # BUGFIX: the previous code read `result.summary` on what is already a
        # ResultSummary; that attribute does not exist, so every successful
        # write raised AttributeError and was reported as an HTTP 500.
        nodes_created = summary.counters.nodes_created
        if nodes_created > 0:
            logger.info(f"{nodes_created} new {node_label} node(s) added successfully.")
        logger.info(f"MERGE operation for {node_label}: {summary.counters.nodes_created} created, {summary.counters.properties_set} properties affected.")
        return nodes_created  # Return the number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_label} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
# --- FastAPI Endpoint ---
# NOTE(review): the route decorator was missing, so this coroutine was never
# registered with FastAPI and the API exposed no endpoint. The path below is
# reconstructed from the handler's intent — confirm it matches what clients
# actually call before deploying.
@app.post("/research-papers/{arxiv_id}", status_code=201)  # 201 Created for successful creation
async def add_single_research_paper(arxiv_id: str):
    """
    Fetches a research paper from Arxiv by its ID, extracts information,
    generates a summary, and adds/updates it as a 'ResearchPaper' node in Neo4j.

    Raises:
        HTTPException 500: Neo4j is unconfigured, unreachable, or errors out.
        HTTPException 404: the Arxiv page could not be fetched or parsed.
    """
    node_type = "ResearchPaper"
    logger.info(f"Processing request for Arxiv ID: {arxiv_id}")
    if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
        logger.error("Neo4j database connection details are not configured on the server.")
        raise HTTPException(status_code=500, detail="Neo4j database connection details are not configured on the server.")
    # Step 1: Extract paper data. The placeholder-title prefixes below are the
    # sentinels set by extract_research_paper_arxiv on fetch/parse failure.
    paper_data = extract_research_paper_arxiv(arxiv_id, node_type)
    if paper_data["title"].startswith("Error fetching content") or paper_data["title"] == "Title not found on page":
        logger.warning(f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
        raise HTTPException(status_code=404, detail=f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
    # Step 2: Add/Update in Neo4j. A fresh driver is opened per request and
    # always closed in the finally block.
    driver_instance = None  # Initialize for the finally block
    try:
        auth_token = basic_auth(NEO4J_USER, NEO4J_PASSWORD)
        driver_instance = GraphDatabase.driver(NEO4J_URI, auth=auth_token)
        driver_instance.verify_connectivity()
        logger.info("Successfully connected to Neo4j.")
        nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
        if nodes_created_count > 0:
            message = f"Research paper {arxiv_id} was successfully added to Neo4j."
            status_code_response = 201  # Created
        else:
            # MERGE matched an existing node and updated it instead of creating
            # one; that still counts as success (the endpoint is idempotent).
            message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
            status_code_response = 200  # OK (no new creation, but operation successful)
        logger.info(message)
        # The actual HTTP status comes from the decorator (201) unless an
        # HTTPException overrides it; status_code_response is informational
        # and only echoed in the JSON body.
        return {
            "message": message,
            "data": paper_data,
            "response_status_info": status_code_response
        }
    except HTTPException as e:  # Re-raise HTTPExceptions untouched
        logger.error(f"HTTPException during Neo4j operation for {arxiv_id}: {e.detail}")
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during Neo4j operation for {arxiv_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")
    finally:
        if driver_instance:
            driver_instance.close()
            logger.info("Neo4j connection closed.")