# Daksh Chaudhary
# Imported re
# 07d2fc1 verified
import os
import json
import requests
import re
from io import BytesIO
from llama_index.llms.mistralai import MistralAI
def get_llm():
    """
    Build and return the MistralAI client used by the app.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable so
    that it never has to be hard-coded in the source.

    Raises:
        ValueError: if ``MISTRAL_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("MISTRAL_API_KEY")
    if api_key:
        return MistralAI(api_key=api_key, model="mistral-medium-latest", timeout=240)
    # Fail loudly up front rather than letting the first LLM call fail later.
    raise ValueError("MISTRAL_API_KEY environment variable not set. Please set it before running the app.")
def download_pdf_from_url(url: str):
    """
    Fetch the resource at *url* and hand it back as an in-memory stream.

    Returns:
        A ``BytesIO`` wrapping the response body, or ``None`` when the request
        fails (connection error, timeout after 20 seconds, or an HTTP error
        status). Failures are reported on stdout rather than raised.
    """
    try:
        reply = requests.get(url, timeout=20)
        # Treat 4xx/5xx responses as failures instead of returning an error page.
        reply.raise_for_status()
        payload = reply.content
    except requests.exceptions.RequestException as err:
        print(f"Error downloading {url}: {err}")
        return None
    return BytesIO(payload)
def _extract_json_object(text: str) -> dict:
    """Return the first decodable JSON object embedded in *text*.

    LLM responses often wrap the JSON in markdown fences or prose that may
    itself contain braces, so each ``{`` is tried in turn with
    ``JSONDecoder.raw_decode`` instead of matching from the first ``{`` to the
    last ``}``.

    Raises:
        ValueError: if no ``{`` in *text* starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return obj
    raise ValueError("No valid JSON object found in the LLM response.")


def _normalize_authors(raw_authors) -> list:
    """Coerce the ``authors`` value into a list of non-empty name strings."""
    if isinstance(raw_authors, str):
        # A bare string is one author; joining it directly would split it
        # into individual characters.
        raw_authors = [raw_authors]
    if not isinstance(raw_authors, (list, tuple)):
        return []
    names = [str(name).strip() for name in raw_authors if name is not None]
    return [name for name in names if name]


def _key_fragment(text: str, fallback: str) -> str:
    """Lower-case *text* and drop characters that are unsafe in a BibTeX key."""
    cleaned = re.sub(r"[^\w-]", "", text.lower())
    return cleaned or fallback


def _first_author_lastname(names: list) -> str:
    """Return the key fragment for the first author's surname, or ``unknown``."""
    if not names:
        return "unknown"
    first = names[0]
    if "," in first:
        # "Last, First" ordering: the surname precedes the comma.
        surname = first.split(",", 1)[0]
    else:
        surname = first.split()[-1]
    return _key_fragment(surname, "unknown")


def format_to_bibtex(citation_json_str: str, filename: str) -> str:
    """Format LLM-produced citation JSON into a BibTeX ``@article`` entry.

    Args:
        citation_json_str: Raw LLM response expected to contain a JSON object
            with ``title``, ``authors`` (list of names, or a single name
            string) and ``year``. Surrounding prose or markdown is tolerated.
        filename: Name of the source file; an arXiv identifier
            (``YYMM.NNNN`` or ``YYMM.NNNNN``) is extracted from it if present.

    Returns:
        The BibTeX entry, or a fixed human-readable error message when the
        citation data cannot be extracted. This function does not raise for
        malformed input; the failure is printed and the message returned.
    """
    try:
        data = _extract_json_object(citation_json_str)

        # Accept both the pre-2015 (4-digit) and current (5-digit) arXiv
        # sequence numbers.
        id_match = re.search(r"(\d{4}\.\d{4,5})", filename)
        arxiv_id = id_match.group(1) if id_match else "N/A"

        raw_title = data.get("title")
        title = str(raw_title) if raw_title else "No Title Found"

        names = _normalize_authors(data.get("authors"))
        authors = " and ".join(names) if names else "N/A"

        year = data.get("year")
        if year is None or year == "":
            year = "N/A"

        # The citation key is built from the *first* author's surname, the
        # year, and the first word of the title.
        first_author_lastname = _first_author_lastname(names)
        title_words = str(raw_title).split() if raw_title else []
        first_title_word = (
            _key_fragment(title_words[0], "untitled") if title_words else "untitled"
        )
        key = f"{first_author_lastname}{year}{first_title_word}"

        return "\n".join([
            f"@article{{{key},",
            f"title = {{{title}}},",
            f"author = {{{authors}}},",
            f"year = {{{year}}},",
            f"journal = {{arXiv preprint arXiv:{arxiv_id}}}",
            "}",
        ])
    # TypeError covers non-string inputs (e.g. ``None``) reaching the
    # string/regex calls, so those also produce the fallback message.
    except (json.JSONDecodeError, KeyError, AttributeError, TypeError, ValueError) as e:
        print(f"Error formatting BibTeX: {e}")
        return "Could not generate BibTeX citation. The required data could not be extracted."