RFP_Analyzer_Agent

Paused

App Files Files Community

RFP_Analyzer_Agent / backend.py

cryogenic22

Update backend.py

a35c160 verified about 1 year ago

raw

history blame contribute delete

8.42 kB

	# Import Dependencies (dependencies.py)
	import streamlit as st
	from langchain.chains import RetrievalQA
	from langchain_community.vectorstores import FAISS
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
	from transformers import pipeline
	import re
	import sqlite3
	from sqlite3 import Error
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import requests
	import pandas as pd
	from pydrive.auth import GoogleAuth
	from pydrive.drive import GoogleDrive
	from io import BytesIO
	from googleapiclient.discovery import build
	from googleapiclient.http import MediaIoBaseDownload
	from google.oauth2 import service_account
	import tempfile
	import os
	from langchain.llms import OpenAI # Import the OpenAI class
	from langchain.chat_models import ChatOpenAI # Import ChatOpenAI
	from langchain.memory import ConversationBufferMemory
	from langchain.agents import create_openai_tools_agent, AgentExecutor, Tool
	from langchain.prompts import (
	ChatPromptTemplate,
	MessagesPlaceholder,
	) # Import necessary classes


	# SQLite Database Functions (database.py)
	def create_connection(db_file):
	    """Open a SQLite connection to ``db_file``.

	    Returns the ``sqlite3.Connection`` on success. If ``sqlite3.connect``
	    raises ``sqlite3.Error``, the error is shown through ``st.error`` and
	    ``None`` is returned instead.
	    """
	    try:
	        return sqlite3.connect(db_file)
	    except Error as exc:
	        st.error(f"Error: {exc}")
	    return None


	def create_tables(conn):
	    """Create the ``documents``, ``queries`` and ``annotations`` tables.

	    Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this on
	    a database that already has the tables does not fail.

	    Args:
	        conn: An open ``sqlite3.Connection``. ``None`` is tolerated because
	            ``create_connection`` returns ``None`` when it cannot connect.

	    Returns:
	        None. Failures are reported through ``st.error`` rather than raised.
	    """
	    if conn is None:
	        # Without this guard ``conn.cursor()`` raises AttributeError, which
	        # the ``except Error`` handler below would not catch.
	        st.error("Error: no database connection available.")
	        return

	    sql_create_documents_table = """
	        CREATE TABLE IF NOT EXISTS documents (
	            id INTEGER PRIMARY KEY AUTOINCREMENT,
	            name TEXT NOT NULL,
	            content TEXT NOT NULL,
	            upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
	        );
	    """

	    sql_create_queries_table = """
	        CREATE TABLE IF NOT EXISTS queries (
	            id INTEGER PRIMARY KEY AUTOINCREMENT,
	            query TEXT NOT NULL,
	            response TEXT NOT NULL,
	            document_id INTEGER,
	            query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
	            FOREIGN KEY (document_id) REFERENCES documents (id)
	        );
	    """

	    sql_create_annotations_table = """
	        CREATE TABLE IF NOT EXISTS annotations (
	            id INTEGER PRIMARY KEY AUTOINCREMENT,
	            document_id INTEGER NOT NULL,
	            annotation TEXT NOT NULL,
	            page_number INTEGER,
	            annotation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
	            FOREIGN KEY (document_id) REFERENCES documents (id)
	        );
	    """

	    # ``documents`` goes first because the other two tables reference it.
	    ddl_statements = (
	        sql_create_documents_table,
	        sql_create_queries_table,
	        sql_create_annotations_table,
	    )

	    try:
	        c = conn.cursor()
	        for statement in ddl_statements:
	            c.execute(statement)
	    except Error as e:
	        st.error(f"Error: {e}")


	# FAISS Initialization (faiss_initialization.py)
	def initialize_faiss(embeddings, documents, document_names):
	    """Build a FAISS vector store from ``documents``.

	    One ``{"source": name}`` metadata dict is created per entry of
	    ``document_names`` and handed to ``FAISS.from_texts`` together with the
	    texts and the embedding model.
	    NOTE(review): presumably ``document_names`` holds one name per text in
	    ``documents`` -- confirm against the caller.

	    Returns the vector store, or ``None`` (after reporting through
	    ``st.error``) if anything raises.
	    """
	    try:
	        source_metadata = [{"source": doc_name} for doc_name in document_names]
	        return FAISS.from_texts(documents, embeddings, metadatas=source_metadata)
	    except Exception as exc:
	        st.error(f"Error initializing FAISS: {exc}")
	    return None


	# Document Upload & Parsing Functions (document_parsing.py)
	@st.cache_data
	def upload_and_parse_documents(documents):
	    """Parse uploaded PDF files into text chunks and per-page text.

	    Args:
	        documents: Iterable of uploaded file objects exposing ``.name`` and
	            ``.read()``.

	    Returns:
	        A tuple ``(all_texts, document_names, document_pages)``.
	        ``all_texts`` is a flat list of chunks (``chunk_size=1000``,
	        ``chunk_overlap=100``) from every file that parsed successfully,
	        ``document_names`` lists those files' names, and
	        ``document_pages`` holds, in the same order, each file's list of raw
	        page texts. Files whose name was already seen are skipped with a
	        warning; files that fail to parse are reported through ``st.error``
	        and contribute nothing to any of the three lists.
	    """
	    all_texts = []
	    document_names = []
	    document_pages = []
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

	    for doc in documents:
	        if doc.name in document_names:
	            st.warning(
	                f"Duplicate file name detected: {doc.name}. This file will be ignored.",
	                icon="⚠️",
	            )
	            continue  # Skip to the next file

	        tmp_file_path = None
	        try:
	            # PyPDFLoader is given a path, so the upload is spilled to disk.
	            with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
	                # Record the path before writing so cleanup still runs if
	                # the write itself fails.
	                tmp_file_path = tmp_file.name
	                tmp_file.write(doc.read())

	            pages = PyPDFLoader(tmp_file_path).load()
	            page_contents = [page.page_content for page in pages]
	            doc_chunks = []
	            for page_text in page_contents:
	                doc_chunks.extend(text_splitter.split_text(page_text))
	        except Exception as e:
	            st.error(f"Error parsing document {doc.name}: {e}")
	        else:
	            # Commit only after the whole file parsed, so the three result
	            # lists never get out of step with each other.
	            document_names.append(doc.name)
	            all_texts.extend(doc_chunks)
	            document_pages.append(page_contents)
	        finally:
	            # delete=False means the file is ours to remove, on success
	            # and on failure alike.
	            if tmp_file_path is not None and os.path.exists(tmp_file_path):
	                os.remove(tmp_file_path)

	    return all_texts, document_names, document_pages


	@st.cache_data
	def parse_pdf_from_url(url):
	    """Download a PDF from ``url`` and split its text into chunks.

	    Args:
	        url: Address of the PDF. The text after the last ``/`` becomes the
	            returned document name.

	    Returns:
	        ``(all_texts, document_name)`` on success, where ``all_texts`` is the
	        list of chunks (``chunk_size=1000``, ``chunk_overlap=100``) from all
	        pages. ``(None, None)`` if the download or the parsing fails; the
	        failure is reported through ``st.error``.
	    """
	    tmp_file_path = None
	    try:
	        # A timeout keeps an unresponsive server from blocking the app
	        # forever; a timeout error is a RequestException and is handled below.
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()

	        # A unique temp file instead of a fixed "temp.pdf" so concurrent
	        # sessions cannot overwrite each other's download.
	        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
	            tmp_file_path = tmp_file.name
	            tmp_file.write(response.content)

	        pages = PyPDFLoader(tmp_file_path).load()
	        document_name = url.split("/")[-1]
	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000, chunk_overlap=100
	        )
	        all_texts = []
	        for page in pages:
	            all_texts.extend(text_splitter.split_text(page.page_content))
	        return all_texts, document_name
	    except requests.exceptions.RequestException as e:
	        st.error(f"Failed to download PDF from URL: {e}")
	        return None, None
	    except Exception as e:
	        st.error(f"Error parsing PDF from URL: {e}")
	        return None, None
	    finally:
	        # Remove the download whether or not parsing succeeded.
	        if tmp_file_path is not None and os.path.exists(tmp_file_path):
	            os.remove(tmp_file_path)


	@st.cache_data
	def parse_pdf_from_google_drive(file_id):
	    """Download a PDF from Google Drive and split its text into chunks.

	    Credentials are built from ``st.secrets["gdrive_service_account"]``.

	    Args:
	        file_id: Drive file id passed to ``files().get_media``.

	    Returns:
	        ``(all_texts, document_name)`` on success, where ``document_name`` is
	        ``"GoogleDrive_<file_id>.pdf"`` and ``all_texts`` is the list of
	        chunks (``chunk_size=1000``, ``chunk_overlap=100``) from all pages.
	        ``(None, None)`` if anything raises; the failure is reported through
	        ``st.error``.
	    """
	    tmp_file_path = None
	    try:
	        # Authenticate and create the drive service
	        credentials = service_account.Credentials.from_service_account_info(
	            st.secrets["gdrive_service_account"],
	            scopes=["https://www.googleapis.com/auth/drive"],
	        )
	        service = build("drive", "v3", credentials=credentials)
	        request = service.files().get_media(fileId=file_id)

	        # Stream into a unique temp file instead of a fixed "temp_drive.pdf"
	        # so concurrent sessions cannot overwrite each other's download.
	        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
	            tmp_file_path = tmp_file.name
	            downloader = MediaIoBaseDownload(tmp_file, request)
	            done = False
	            while not done:
	                _, done = downloader.next_chunk()

	        pages = PyPDFLoader(tmp_file_path).load()
	        document_name = f"GoogleDrive_{file_id}.pdf"
	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000, chunk_overlap=100
	        )
	        all_texts = []
	        for page in pages:
	            all_texts.extend(text_splitter.split_text(page.page_content))
	        return all_texts, document_name
	    except Exception as e:
	        st.error(f"Error downloading PDF from Google Drive: {e}")
	        return None, None
	    finally:
	        # Remove the download whether or not parsing succeeded.
	        if tmp_file_path is not None and os.path.exists(tmp_file_path):
	            os.remove(tmp_file_path)


	# Embeddings for Semantic Search (embeddings.py)
	@st.cache_resource
	def get_embeddings_model():
	    """Load the sentence-transformers embedding model used for search.

	    Returns a ``HuggingFaceEmbeddings`` instance for
	    ``sentence-transformers/all-MiniLM-L6-v2``, or ``None`` (after reporting
	    through ``st.error``) if constructing it raises.
	    """
	    try:
	        return HuggingFaceEmbeddings(
	            model_name="sentence-transformers/all-MiniLM-L6-v2"
	        )
	    except Exception as exc:
	        st.error(f"Error loading embeddings model: {exc}")
	    return None


	# QA System Initialization (qa_system.py)


	@st.cache_resource
	def initialize_qa_system(vector_store):
	    """Initialize the retrieval-augmented QA chain with chat handling.

	    The returned chain retrieves the 3 most similar chunks from
	    ``vector_store``, joins their text into a ``context`` string, fills the
	    chat prompt with that context and the question, and sends it to GPT-4.

	    Args:
	        vector_store: Vector store exposing ``as_retriever``.
	            NOTE(review): ``st.cache_resource`` hashes its arguments; if the
	            store turns out to be unhashable, Streamlit's convention is an
	            underscore-prefixed parameter name -- confirm before renaming,
	            since keyword callers would be affected.

	    Returns:
	        A runnable chain accepting either the question string or a dict
	        with an ``"input"`` key, or ``None`` (after reporting through
	        ``st.error``) if construction fails.
	    """
	    try:
	        llm = ChatOpenAI(
	            temperature=0.5,
	            model_name="gpt-4",
	            api_key=os.environ.get("OPENAI_API_KEY"),
	        )

	        # The prompt must carry a {context} slot, otherwise the retrieved
	        # chunks are computed but never shown to the model.
	        system_message = (
	            "You are an expert consultant specializing in analyzing Request for Proposal (RFP) documents.\n"
	            "Your goal is to provide clear, accurate responses based on the provided context.\n"
	            "Start with a direct answer and organize additional details under relevant headers.\n\n"
	            "Context:\n{context}"
	        )
	        prompt = ChatPromptTemplate.from_messages([
	            ("system", system_message),
	            ("human", "{input}"),
	        ])

	        # Create the retriever chain
	        retriever = vector_store.as_retriever(
	            search_type="similarity",
	            search_kwargs={"k": 3},
	        )

	        def _question(payload):
	            # Accept both chain.invoke("...") and chain.invoke({"input": "..."}).
	            if isinstance(payload, dict):
	                return payload["input"]
	            return payload

	        def _join_documents(docs):
	            return "\n\n".join(doc.page_content for doc in docs)

	        # Only one dict literal feeds the pipeline: `dict | dict` would be a
	        # plain dict merge, not a chain step. The dict and the bare functions
	        # are coerced to runnables by the `|` operator of the runnable on
	        # their right-hand side.
	        chain = (
	            {
	                "input": _question,
	                "context": _question | retriever | _join_documents,
	            }
	            | prompt
	            | llm
	        )

	        return chain

	    except Exception as e:
	        st.error(f"Error initializing QA system: {e}")
	        return None