# contract-intelligence / app / main.py.new
# Surajkumaar's picture
# Upload 2898 files
# a488104 verified
# app/main.py
import os
import uuid
import asyncio
import json
import pathlib
import ssl
import socket
import certifi
import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException
from fastapi.responses import JSONResponse
from pypdf import PdfReader
from pydantic import BaseModel
import motor.motor_asyncio
import httpx
# Silence urllib3's InsecureRequestWarning. Several helpers in this module
# issue requests with certificate verification turned off (verify=False), and
# without this call every such request would emit a warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Load environment variables from .env file if it exists
def load_env_file(path) -> None:
    """Copy ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Whitespace around the key and the value is removed, and one pair
    of matching single or double quotes around the value is dropped. Only the
    first ``=`` separates key from value, so values may contain ``=``.
    Existing environment variables with the same name are overwritten.

    Args:
        path: Location of the file to read (``str`` or ``pathlib.Path``).
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            # "KEY='some value'" should store the text without the quotes.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            # A line such as "=value" has no usable name; assigning an empty
            # key to os.environ raises, so skip it instead of crashing at import.
            if not key:
                continue
            os.environ[key] = value


env_path = pathlib.Path(__file__).parent.parent / '.env'
if env_path.exists():
    print(f"Loading environment variables from {env_path}")
    load_env_file(env_path)
# ----------------- Configuration -----------------
# Every setting falls back to the local-development value used previously and
# can be overridden through the process environment.
MONGO_URL = os.getenv("MONGO_URL", "mongodb://localhost:27017")
DB_NAME = os.getenv("MONGO_DB_NAME", "contracts_db")
COLLECTION_NAME = os.getenv("MONGO_COLLECTION_NAME", "contracts")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")  # set in .env
if not OPENROUTER_API_KEY:
    print("WARNING: OPENROUTER_API_KEY environment variable is not set. API calls will fail.")
MISTRAL_MODEL = os.getenv("OPENROUTER_MODEL", "mistralai/mistral-7b-instruct:free")  # free model
# Directory where uploaded PDFs are written; created on import if missing.
UPLOAD_DIR = os.getenv("UPLOAD_DIR", "uploads")
os.makedirs(UPLOAD_DIR, exist_ok=True)
# ----------------- MongoDB Client -----------------
# Motor (async) client created once at import time and shared by the route
# handlers and the background processing task defined in this module.
client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URL)
db = client[DB_NAME]
# One document per uploaded contract; "_id" is the UUID string generated by
# the upload route.
contracts_collection = db[COLLECTION_NAME]
# ----------------- FastAPI App -----------------
app = FastAPI(title="Contract Intelligence API")
# ----------------- Models -----------------
# Response body returned by GET /contracts/{contract_id}/status.
class ContractStatus(BaseModel):
    # Lifecycle state; this module writes "uploaded", "processing",
    # "completed" and "error".
    status: str
    # Progress value stored by the pipeline (0 on upload, 100 when finished).
    progress: int
    # Always returned as None by the status route in this module.
    score: int | None = None
    # Failure or notice message stored on the contract document, if any.
    error: str | None = None
# ----------------- Helper Functions -----------------
async def resolve_hostname_to_ip(hostname: str) -> str:
    """Resolve ``hostname`` to an IPv4 address, with a static fallback table.

    The system resolver is tried first. When it reports a lookup failure, a
    small table of hard-coded addresses is consulted; if the name is not in
    the table either, ``hostname`` is returned unchanged.

    Args:
        hostname: Host name (or IP literal) to resolve.

    Returns:
        The resolved address, a hard-coded fallback address, or the input.
    """
    # Hard-coded last-resort addresses. These are snapshots and may go stale.
    known_ips = {
        "openrouter.ai": "104.18.6.192",
        "api.openrouter.ai": "104.18.7.192",
        "httpbin.org": "34.205.4.79",
    }
    try:
        # gethostbyname() blocks until the resolver answers; run it in a
        # worker thread so the event loop keeps serving other requests.
        return await asyncio.to_thread(socket.gethostbyname, hostname)
    except socket.gaierror:
        return known_ips.get(hostname, hostname)
async def test_network_connectivity() -> bool:
    """Report whether any of a few well-known HTTPS endpoints can be reached.

    Each target is requested in turn with a fresh client (10 second timeout,
    certificate verification disabled). The first reply with status 200, 301
    or 302 ends the probe with ``True``. Any exception raised for a target is
    ignored and the next target is tried. ``False`` means no target qualified.
    """
    probe_targets = (
        "https://httpbin.org/get",
        "https://8.8.8.8",  # Google DNS
        "https://1.1.1.1",  # Cloudflare DNS
    )
    for target in probe_targets:
        try:
            async with httpx.AsyncClient(timeout=10, verify=False) as probe:
                reply = await probe.get(target)
        except Exception:
            continue
        else:
            if reply.status_code in [200, 301, 302]:
                return True
    return False
async def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated text of every page in the PDF at ``file_path``.

    Pages for which the extractor yields nothing contribute an empty string.
    Parsing is synchronous, so it is executed in a worker thread to avoid
    stalling the event loop while a large document is read.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The text of all pages joined without a separator.
    """
    def _read_all_pages() -> str:
        reader = PdfReader(file_path)
        # join() avoids the repeated re-allocation of building with "+=".
        return "".join(page.extract_text() or "" for page in reader.pages)

    return await asyncio.to_thread(_read_all_pages)
async def query_mistral_llm_requests_fallback(contract_text: str) -> dict:
    """
    Fallback function using requests library with aggressive SSL bypass

    Posts the extraction prompt to each OpenRouter endpoint in turn and
    returns the first usable answer. The blocking ``requests`` call is run in
    a worker thread so the event loop is not stalled while waiting.

    Args:
        contract_text: Plain text of the contract to analyse.

    Returns:
        The JSON object parsed from the model's reply. When the reply is not
        valid JSON, a dict with the expected keys all set to ``None``.

    Raises:
        ValueError: If the API key is not configured, or if no endpoint
            produced a usable response.
    """
    if not OPENROUTER_API_KEY:
        raise ValueError("OPENROUTER_API_KEY environment variable is not set.")

    system_prompt = "You are a helpful AI assistant that extracts structured information from contracts. Return your response as valid JSON."
    user_prompt = f"Extract the following contract details as JSON with keys: parties, financials, payment_terms, sla, contacts. If any field is missing, set it as null. Additionally, extract ANY other important fields or information you find in the contract that doesn't fit into these categories and include them under an 'additional_fields' key. Contract text:\n{contract_text}"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "User-Agent": "ContractIntelligence/1.0"
    }
    payload = {
        "model": MISTRAL_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.3,
        "max_tokens": 1000
    }

    # Add retry strategy
    # NOTE(review): urllib3's default allowed methods do not appear to include
    # POST, so status_forcelist may have no effect for these calls - confirm
    # against the installed urllib3 version.
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    endpoints = [
        "https://openrouter.ai/api/v1/chat/completions",
        "https://api.openrouter.ai/api/v1/chat/completions"
    ]

    # The context manager guarantees the session's pooled connections are
    # released on every exit path (return, raise, or exhausted endpoints).
    with requests.Session() as session:
        # SECURITY: certificate verification is disabled on purpose for this
        # last-resort path, which means the API key and the contract text are
        # sent over a connection that can be intercepted. Review before
        # relying on this outside restricted networks.
        session.verify = False
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        for endpoint in endpoints:
            try:
                print(f"Requests fallback: trying {endpoint}")
                # requests is synchronous; off-load the call so other
                # coroutines keep running during the (up to 60 s) wait.
                response = await asyncio.to_thread(
                    session.post,
                    endpoint,
                    json=payload,
                    headers=headers,
                    timeout=60,
                    verify=False
                )
                print(f"Requests response status: {response.status_code}")
                if response.status_code != 200:
                    print(f"Requests error: {response.status_code} - {response.text}")
                    continue
                data = response.json()
                if "choices" in data and len(data["choices"]) > 0:
                    text_output = data["choices"][0]["message"]["content"]
                    print("✅ Requests fallback successful!")
                    try:
                        return json.loads(text_output)
                    except json.JSONDecodeError:
                        # The model answered, but not with parseable JSON:
                        # hand back the expected shape with empty values.
                        return {
                            "parties": None,
                            "financials": None,
                            "payment_terms": None,
                            "sla": None,
                            "contacts": None,
                            "additional_fields": None
                        }
                # A 200 without any choices is not usable; report it instead
                # of moving on silently.
                print(f"Requests error: unexpected response format from {endpoint}")
            except Exception as e:
                print(f"Requests exception for {endpoint}: {str(e)}")
                continue

    raise ValueError("All requests fallback attempts failed")
async def query_mistral_llm(contract_text: str) -> dict:
    """
    Ask the Mistral model on OpenRouter to extract structured contract fields.

    Four endpoint variants (two host names, two direct IP addresses with an
    explicit Host header) are each tried with four httpx client profiles, in
    order, until one call yields a response containing choices. If none does,
    the requests-based fallback is attempted; if that fails as well, a
    ValueError describing the most recent httpx failure is raised.

    Returns the JSON object parsed from the model's reply, or a dict with the
    expected keys set to None when the reply is not valid JSON.
    """
    # Nothing can succeed without credentials.
    if not OPENROUTER_API_KEY:
        raise ValueError("OPENROUTER_API_KEY environment variable is not set. Please set it before making API calls.")

    system_prompt = "You are a helpful AI assistant that extracts structured information from contracts. Return your response as valid JSON."
    user_prompt = f"Extract the following contract details as JSON with keys: parties, financials, payment_terms, sla, contacts. If any field is missing, set it as null. Additionally, extract ANY other important fields or information you find in the contract that doesn't fit into these categories and include them under an 'additional_fields' key. Contract text:\n{contract_text}"
    base_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost:8000",
        "X-Title": "Contract Intelligence API",
        "User-Agent": "ContractIntelligence/1.0",
    }
    request_body = {
        "model": MISTRAL_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 1000,
    }

    # (label, url, headers): host names first, then direct IPs that bypass DNS.
    targets = [
        (
            "Standard OpenRouter endpoint",
            "https://openrouter.ai/api/v1/chat/completions",
            base_headers,
        ),
        (
            "Alternative OpenRouter endpoint",
            "https://api.openrouter.ai/api/v1/chat/completions",
            base_headers,
        ),
        (
            "Direct IP for openrouter.ai",
            "https://104.18.6.192/api/v1/chat/completions",
            {**base_headers, "Host": "openrouter.ai"},
        ),
        (
            "Direct IP for api.openrouter.ai",
            "https://104.18.7.192/api/v1/chat/completions",
            {**base_headers, "Host": "api.openrouter.ai"},
        ),
    ]

    # (label, AsyncClient keyword arguments), from strictest to most lenient.
    client_profiles = [
        (
            "Standard secure",
            {"timeout": httpx.Timeout(60.0, connect=30.0), "verify": True, "follow_redirects": True},
        ),
        (
            "Extended timeout",
            {"timeout": httpx.Timeout(120.0, connect=60.0), "verify": True, "follow_redirects": True},
        ),
        (
            "No SSL verification",
            {"timeout": httpx.Timeout(60.0, connect=30.0), "verify": False, "follow_redirects": True},
        ),
        (
            "Basic connection",
            {"timeout": httpx.Timeout(30.0, connect=15.0), "verify": False, "follow_redirects": False},
        ),
    ]

    last_error = None
    for label, url, request_headers in targets:
        print(f"Trying strategy: {label} - {url}")
        for profile_name, client_kwargs in client_profiles:
            try:
                print(f" Using config: {profile_name}")
                async with httpx.AsyncClient(**client_kwargs) as http_client:
                    try:
                        reply = await http_client.post(
                            url,
                            json=request_body,
                            headers=request_headers,
                        )
                        print(f" Response status: {reply.status_code}")
                        reply.raise_for_status()
                        body = reply.json()
                        # OpenRouter answers carry the text under choices[0].message.content.
                        if "choices" not in body or len(body["choices"]) == 0:
                            last_error = ValueError(f"Unexpected API response format: {body}")
                        else:
                            content = body["choices"][0]["message"]["content"]
                            print(f"✅ Success with {label}")
                            try:
                                return json.loads(content)
                            except json.JSONDecodeError:
                                print("Warning: Could not parse LLM response as JSON, returning default structure")
                                return {
                                    "parties": None,
                                    "financials": None,
                                    "payment_terms": None,
                                    "sla": None,
                                    "contacts": None,
                                    "additional_fields": None,
                                }
                    except httpx.HTTPStatusError as exc:
                        last_error = ValueError(f"HTTP {exc.response.status_code}: {exc.response.text}")
                        print(f" HTTP Error: {exc.response.status_code}")
                    except httpx.ConnectError as exc:
                        last_error = ValueError(f"Connection error: {str(exc)}")
                        print(f" Connection Error: {str(exc)}")
                    except httpx.TimeoutException as exc:
                        last_error = ValueError(f"Timeout error: {str(exc)}")
                        print(f" Timeout Error: {str(exc)}")
                    except Exception as exc:
                        last_error = ValueError(f"Request error: {str(exc)}")
                        print(f" Request Error: {str(exc)}")
            except Exception as exc:
                # Raised while building or closing the client itself.
                last_error = ValueError(f"Client creation error: {str(exc)}")
                print(f" Client Error: {str(exc)}")

    # This should never be reached: every failed attempt records an error.
    if not last_error:
        raise ValueError("All API connection attempts failed without specific error")

    # Every httpx attempt failed; give the requests-based helper a chance.
    print("🔄 Trying requests library fallback...")
    try:
        return await query_mistral_llm_requests_fallback(contract_text)
    except Exception as fallback_error:
        print(f"Requests fallback also failed: {str(fallback_error)}")
        # Turn the most recent httpx failure into an actionable message.
        error_msg = str(last_error)
        if "getaddrinfo failed" in error_msg or "Name or service not known" in error_msg:
            explanation = "DNS resolution failed. This could be due to: 1) No internet connection, 2) DNS server issues, 3) Firewall blocking requests. Please check your network settings."
        elif "SSL" in error_msg.upper() or "certificate" in error_msg.lower() or "handshake" in error_msg.lower():
            explanation = "SSL/TLS connection failed. This might be due to corporate firewall or security settings. All SSL bypass attempts failed."
        elif "timeout" in error_msg.lower():
            explanation = "Connection timeout. Please check your internet speed and try again."
        else:
            explanation = f"All connection attempts failed. HTTPX error: {error_msg}. Requests error: {str(fallback_error)}"
        raise ValueError(explanation)
async def process_contract(contract_id: str, file_path: str):
    """
    Background pipeline for one uploaded contract.

    Extracts the PDF text, checks connectivity, asks the LLM for structured
    data and records progress, the result or an error message on the
    contract's MongoDB document. Exceptions are converted into an "error"
    status on the document.
    """
    async def save(fields: dict) -> None:
        # Apply a $set update to this contract's document.
        await contracts_collection.update_one(
            {"_id": contract_id},
            {"$set": fields},
        )

    try:
        await save({"status": "processing", "progress": 10})
        pdf_text = await extract_text_from_pdf(file_path)
        await save({"progress": 30})

        # Without an API key only the stored PDF is available; finish early.
        if not OPENROUTER_API_KEY:
            await save({
                "status": "completed",
                "progress": 100,
                "extracted_data": None,
                "error": "No API key provided. Only PDF storage is available.",
            })
            return

        # Check connectivity before attempting the LLM call.
        await save({"progress": 40})
        if not await test_network_connectivity():
            await save({
                "status": "error",
                "progress": 100,
                "error": "No internet connectivity detected. Please check your network connection.",
            })
            return

        await save({"progress": 50})
        # Key present and network reachable: run the extraction.
        llm_result = await query_mistral_llm(pdf_text)
        await save({
            "status": "completed",
            "progress": 100,
            "extracted_data": llm_result,
        })
    except Exception as exc:
        # Replace low-level failure text with friendlier wording where possible.
        message = str(exc)
        if "getaddrinfo failed" in message:
            message = "DNS resolution failed. Please check your internet connection and DNS settings."
        elif "SSL" in message.upper():
            message = "SSL connection failed. This might be due to firewall or network security settings."
        elif "timeout" in message.lower():
            message = "Connection timeout. Please check your internet connection."
        await save({
            "status": "error",
            "progress": 100,
            "error": message,
        })
# ----------------- Routes -----------------
@app.post("/contracts/upload")
async def upload_contract(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
    """Store an uploaded PDF and schedule its analysis in the background.

    The file is saved under ``UPLOAD_DIR`` using a freshly generated UUID as
    its name, a tracking document is inserted into MongoDB, and
    ``process_contract`` is queued as a background task.

    Args:
        file: The uploaded file; its name must end in ``.pdf`` (any case).
        background_tasks: Task queue used to schedule the processing.

    Returns:
        ``{"contract_id": <uuid string>}`` identifying the new contract.

    Raises:
        HTTPException: 400 when the upload has no name or is not a ``.pdf``.
    """
    # An upload may arrive without a file name; treat that as "not a PDF"
    # (400) rather than failing with an AttributeError (500).
    filename = file.filename or ""
    # Accept ".PDF" / ".Pdf" as well as ".pdf".
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    contract_id = str(uuid.uuid4())
    file_path = os.path.join(UPLOAD_DIR, f"{contract_id}.pdf")
    with open(file_path, "wb") as f:
        f.write(await file.read())

    contract_doc = {
        "_id": contract_id,
        "filename": filename,
        "status": "uploaded",
        "progress": 0,
        "extracted_data": None,
        "error": None
    }
    await contracts_collection.insert_one(contract_doc)

    # Launch background processing
    background_tasks.add_task(process_contract, contract_id, file_path)
    return {"contract_id": contract_id}
@app.get("/contracts/{contract_id}/status", response_model=ContractStatus)
async def get_contract_status(contract_id: str):
    # Report the stored status, progress and error of one contract.
    # Responds with 404 when no document has this id; score is always None.
    record = await contracts_collection.find_one({"_id": contract_id})
    if record:
        return ContractStatus(
            status=record.get("status"),
            progress=record.get("progress"),
            score=None,
            error=record.get("error"),
        )
    raise HTTPException(status_code=404, detail="Contract not found")
@app.get("/contracts/{contract_id}")
async def get_contract_data(contract_id: str):
    # Return the "extracted_data" field of one contract document
    # ({} when the field is absent). Responds with 404 for an unknown id.
    record = await contracts_collection.find_one({"_id": contract_id})
    if record:
        return record.get("extracted_data", {})
    raise HTTPException(status_code=404, detail="Contract not found")