# NOTE: non-code residue ("Spaces: Sleeping") from a Hugging Face Spaces page
# capture was removed here; it was not part of the source program.
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from fastapi import FastAPI, HTTPException | |
| from neo4j import GraphDatabase, basic_auth | |
| import google.generativeai as genai | |
| import logging # Import logging module | |
# --- Logging Configuration ---
# Configure the root logger once at import time: INFO and above, with
# timestamp, logger name, and level in each record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Console output (stderr by default). Append a
        # logging.FileHandler("app.log") here to also log to a file.
        logging.StreamHandler(),
    ],
)

# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
# --- Environment Variable Configuration ---
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail loudly (in the logs) when the Neo4j connection settings are incomplete.
# Deliberately no hard exit: the app is allowed to start and requests will
# fail at runtime instead.
if not (NEO4J_URI and NEO4J_USER and NEO4J_PASSWORD):
    logger.critical("CRITICAL ERROR: NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD environment variables must be set.")
# FastAPI application object — the HTTP entry point for this service.
app = FastAPI(
    title="Arxiv to Neo4j Importer",
    description=(
        "API to fetch research paper data from Arxiv, summarize it with "
        "Gemini, and add it to Neo4j."
    ),
    version="1.0.0",
)
# --- Gemini API Client Initialization ---
# Build the Gemini client eagerly at import time. Any failure is downgraded
# to a warning: the service keeps running and simply skips summary generation.
gemini_model = None
if not GEMINI_API_KEY:
    logger.warning("WARNING: GEMINI_API_KEY environment variable not set. Summary generation will be disabled.")
else:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20")  # Specified model
        logger.info("Gemini API client initialized successfully.")
    except Exception as e:
        logger.warning(f"WARNING: Failed to initialize Gemini API client: {e}. Summary generation will be affected.")
| # --- Utility Functions (Adapted from your script) --- | |
def get_content(number: str, node_type: str) -> str:
    """Fetches raw HTML content from Arxiv or other sources.

    Returns the page body as a single-line string (newlines stripped), or an
    empty string for an unknown node type or any request failure.
    """
    source_urls = {
        "Patent": f"https://patents.google.com/patent/{number}/en",
        "ResearchPaper": f"https://arxiv.org/abs/{number}",
    }
    target_url = source_urls.get(node_type)
    if target_url is None:
        logger.warning(f"Unknown node type: {node_type} for number {number}")
        return ""
    try:
        resp = requests.get(target_url, timeout=10)  # bounded wait — a hung server cannot stall us forever
        resp.raise_for_status()  # 4XX/5XX responses surface as exceptions
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error for {node_type} number: {number} at URL {target_url}: {e}")
        return ""
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_content for {number}: {e}")
        return ""
    else:
        # Replace undecodable bytes rather than raising, and flatten newlines.
        return resp.content.decode('utf-8', errors='replace').replace("\n", "")
def extract_research_paper_arxiv(rp_number: str, node_type: str) -> dict:
    """Extracts title/abstract from an Arxiv abstract page and summarizes it.

    Args:
        rp_number: Arxiv identifier (e.g. "2401.01234").
        node_type: Node label used to select the source URL (expects
            "ResearchPaper"; anything else yields no content).

    Returns:
        dict with keys "document", "arxiv_id", "title", "abstract", "summary".
        Placeholder strings mark anything that could not be fetched, parsed,
        or summarized — callers inspect these prefixes, so do not change them.
    """
    raw_content = get_content(rp_number, node_type)
    rp_data = {
        "document": f"Arxiv {rp_number}",  # ID for the paper
        "arxiv_id": rp_number,
        "title": "Error fetching content or content not found",
        "abstract": "Error fetching content or content not found",
        "summary": "Summary not generated"  # Default summary
    }
    if not raw_content:
        logger.warning(f"No content fetched for Arxiv ID: {rp_number}")
        return rp_data  # Returns default error data
    try:
        soup = BeautifulSoup(raw_content, 'html.parser')
        # --- Title ---
        # Arxiv renders the title as
        # <h1 class="title"><span class="descriptor">Title:</span> actual title</h1>,
        # so the text node right after the descriptor span is the title itself.
        title_tag = soup.find('h1', class_='title')
        if title_tag and title_tag.find('span', class_='descriptor'):
            title_text_candidate = title_tag.find('span', class_='descriptor').next_sibling
            if title_text_candidate and isinstance(title_text_candidate, str):
                rp_data["title"] = title_text_candidate.strip()
            else:
                rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
        elif title_tag:  # Fallback if the span descriptor is not there but h1.title exists
            rp_data["title"] = title_tag.get_text(separator=" ", strip=True).replace("Title:", "").strip()
        # --- Abstract ---
        abstract_tag = soup.find('blockquote', class_='abstract')
        if abstract_tag:
            abstract_text = abstract_tag.get_text(strip=True)
            if abstract_text.lower().startswith('abstract'):
                # Strip the leading "Abstract" label and, if present, its colon.
                prefix_end = abstract_text.lower().find('abstract') + len('abstract')
                if prefix_end < len(abstract_text) and abstract_text[prefix_end] == ':':
                    prefix_end += 1  # Include the colon in removal
                abstract_text = abstract_text[prefix_end:].strip()
            rp_data["abstract"] = abstract_text
        # Distinguish "page fetched but field missing" from "fetch failed".
        if rp_data["title"] == "Error fetching content or content not found" and not title_tag:
            rp_data["title"] = "Title not found on page"
        if rp_data["abstract"] == "Error fetching content or content not found" and not abstract_tag:
            rp_data["abstract"] = "Abstract not found on page"
        # --- Summary (Gemini) ---
        # Only attempt a summary when the client exists and the abstract is a
        # real abstract, not one of the placeholder strings above.
        if gemini_model and rp_data["abstract"] and \
           not rp_data["abstract"].startswith("Error fetching content") and \
           not rp_data["abstract"].startswith("Abstract not found"):
            # English prompt for Gemini.
            # BUGFIX: the closing tag was previously "<document>"; it must be
            # "</document>" so the delimiters are balanced in the prompt.
            prompt = f"""You are a 3GPP standardization expert. Summarize the key information in the provided document in simple technical English relevant to identifying potential Key Issues.
Focus on challenges, gaps, or novel aspects.
Here is the document: <document>{rp_data['abstract']}</document>"""
            try:
                response = gemini_model.generate_content(prompt)
                rp_data["summary"] = response.text
                logger.info(f"Summary generated for Arxiv ID: {rp_number}")
            except Exception as e:
                logger.error(f"Error generating summary with Gemini for Arxiv ID {rp_number}: {e}")
                rp_data["summary"] = "Error generating summary (API failure)"
        elif not gemini_model:
            rp_data["summary"] = "Summary not generated (Gemini API client not available)"
        else:
            rp_data["summary"] = "Summary not generated (Abstract unavailable or problematic)"
    except Exception as e:
        # Parsing must never crash the endpoint; return whatever was gathered.
        logger.error(f"Error parsing content for Arxiv ID {rp_number}: {e}")
    return rp_data
def add_nodes_to_neo4j(driver, data_list: list, node_label: str) -> int:
    """Adds/updates a batch of nodes in Neo4j in a single write transaction.

    Args:
        driver: An open neo4j Driver instance.
        data_list: Property dicts; each must contain an "arxiv_id" key, which
            is the MERGE key.
        node_label: Label interpolated into the Cypher query — must come from
            trusted code, never from user input (labels cannot be parameters).

    Returns:
        Number of nodes actually created (0 when every node already existed
        and was only updated).

    Raises:
        HTTPException: 500 wrapper around any Neo4j driver error.
    """
    if not data_list:
        logger.warning("No data provided to add_nodes_to_neo4j.")
        return 0
    query = (
        f"UNWIND $data as properties "
        f"MERGE (n:{node_label} {{arxiv_id: properties.arxiv_id}}) "  # Use MERGE for idempotency
        f"ON CREATE SET n = properties "
        f"ON MATCH SET n += properties"  # Update properties if the node already exists
    )
    try:
        with driver.session(database="neo4j") as session:  # Specify database if not default
            # execute_write returns whatever the callback returns; .consume()
            # yields the ResultSummary of the completed query.
            summary = session.execute_write(lambda tx: tx.run(query, data=data_list).consume())
        # BUGFIX: the previous code read `result.summary` on what is already a
        # ResultSummary; that attribute does not exist, so every successful
        # write raised AttributeError and was reported as an HTTP 500.
        nodes_created = summary.counters.nodes_created
        if nodes_created > 0:
            logger.info(f"{nodes_created} new {node_label} node(s) added successfully.")
        logger.info(f"MERGE operation for {node_label}: {summary.counters.nodes_created} created, {summary.counters.properties_set} properties affected.")
        return nodes_created  # Return the number of nodes actually created
    except Exception as e:
        logger.error(f"Neo4j Error - Failed to add/update {node_label} nodes: {e}")
        raise HTTPException(status_code=500, detail=f"Neo4j database error: {e}")
# --- FastAPI Endpoint ---
# NOTE(review): the route decorator was missing, so this coroutine was never
# registered with FastAPI and the API exposed no endpoint. The path below is
# reconstructed from the handler's intent — confirm it matches what clients
# actually call before deploying.
@app.post("/research-papers/{arxiv_id}", status_code=201)  # 201 Created for successful creation
async def add_single_research_paper(arxiv_id: str):
    """
    Fetches a research paper from Arxiv by its ID, extracts information,
    generates a summary, and adds/updates it as a 'ResearchPaper' node in Neo4j.

    Raises:
        HTTPException 500: Neo4j is unconfigured, unreachable, or errors out.
        HTTPException 404: the Arxiv page could not be fetched or parsed.
    """
    node_type = "ResearchPaper"
    logger.info(f"Processing request for Arxiv ID: {arxiv_id}")
    if not NEO4J_URI or not NEO4J_USER or not NEO4J_PASSWORD:
        logger.error("Neo4j database connection details are not configured on the server.")
        raise HTTPException(status_code=500, detail="Neo4j database connection details are not configured on the server.")
    # Step 1: Extract paper data. The placeholder-title prefixes below are the
    # sentinels set by extract_research_paper_arxiv on fetch/parse failure.
    paper_data = extract_research_paper_arxiv(arxiv_id, node_type)
    if paper_data["title"].startswith("Error fetching content") or paper_data["title"] == "Title not found on page":
        logger.warning(f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
        raise HTTPException(status_code=404, detail=f"Could not fetch or parse content for Arxiv ID {arxiv_id}. Title: {paper_data['title']}")
    # Step 2: Add/Update in Neo4j. A fresh driver is opened per request and
    # always closed in the finally block.
    driver_instance = None  # Initialize for the finally block
    try:
        auth_token = basic_auth(NEO4J_USER, NEO4J_PASSWORD)
        driver_instance = GraphDatabase.driver(NEO4J_URI, auth=auth_token)
        driver_instance.verify_connectivity()
        logger.info("Successfully connected to Neo4j.")
        nodes_created_count = add_nodes_to_neo4j(driver_instance, [paper_data], node_type)
        if nodes_created_count > 0:
            message = f"Research paper {arxiv_id} was successfully added to Neo4j."
            status_code_response = 201  # Created
        else:
            # MERGE matched an existing node and updated it instead of creating
            # one; that still counts as success (the endpoint is idempotent).
            message = f"Research paper {arxiv_id} was processed (potentially updated if it already existed)."
            status_code_response = 200  # OK (no new creation, but operation successful)
        logger.info(message)
        # The actual HTTP status comes from the decorator (201) unless an
        # HTTPException overrides it; status_code_response is informational
        # and only echoed in the JSON body.
        return {
            "message": message,
            "data": paper_data,
            "response_status_info": status_code_response
        }
    except HTTPException as e:  # Re-raise HTTPExceptions untouched
        logger.error(f"HTTPException during Neo4j operation for {arxiv_id}: {e.detail}")
        raise e
    except Exception as e:
        logger.error(f"An unexpected error occurred during Neo4j operation for {arxiv_id}: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"An unexpected server error occurred: {e}")
    finally:
        if driver_instance:
            driver_instance.close()
            logger.info("Neo4j connection closed.")