"""AI helpers: document tag generation and chat (RAG) system initialization."""

import streamlit as st
from typing import List

from langchain_community.chat_models import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from utils.database import (
    get_collection_documents,
    get_all_documents,
    get_embeddings_model,
    initialize_qa_system,
)

# Only this many leading characters of a document are sent to the LLM for
# tagging, keeping the prompt within a safe token budget.
_TAGGING_CONTENT_LIMIT = 2000


def generate_document_tags(content: str) -> List[str]:
    """Generate tags for a document using AI.

    Sends the first ``_TAGGING_CONTENT_LIMIT`` characters of *content* to the
    chat model and parses its comma-separated reply.

    Args:
        content: Raw document text to analyze.

    Returns:
        A list of non-empty tag strings. On any failure the error is shown via
        ``st.error`` and an empty list is returned.
    """
    try:
        llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo")
        prompt = """Analyze the following document content and generate relevant tags/keywords.
Focus on key themes, topics, and important terminology.
Return only the tags as a comma-separated list.

Content: {content}"""

        response = llm.invoke([
            SystemMessage(
                content=(
                    "You are a document analysis assistant. "
                    "Generate relevant tags as a comma-separated list only."
                )
            ),
            HumanMessage(
                content=prompt.format(content=content[:_TAGGING_CONTENT_LIMIT])
            ),
        ])

        # The AI message body holds the comma-separated tag string.
        tags_text = response.content
        # Filter blanks so an empty/whitespace reply (or trailing commas)
        # yields [] instead of [''].
        return [tag.strip() for tag in tags_text.split(',') if tag.strip()]
    except Exception as e:
        st.error(f"Error generating tags: {e}")
        return []


def initialize_chat_system(collection_id=None) -> bool:
    """Initialize chat system with documents.

    Loads documents (from one collection if *collection_id* is given, else
    all), chunks them, builds a FAISS vector store, and stores the resulting
    QA system in ``st.session_state``.

    Args:
        collection_id: Optional collection to restrict the documents to.

    Returns:
        True on success; False when no documents/content are found or an
        error occurs (reported via ``st.error``).

    Side effects:
        Sets ``st.session_state.vector_store``, ``st.session_state.qa_system``
        and ``st.session_state.chat_ready``.
    """
    try:
        # Get documents based on collection or all documents.
        documents = (
            get_collection_documents(st.session_state.db_conn, collection_id)
            if collection_id
            else get_all_documents(st.session_state.db_conn)
        )
        if not documents:
            st.error("No documents found.")
            return False

        with st.spinner("Processing documents..."):
            # Initialize embeddings and the chunking strategy.
            embeddings = get_embeddings_model()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50,
                length_function=len,
            )

            # Split every document, tagging each chunk with its origin so
            # answers can cite their source.
            all_chunks = []
            for doc in documents:
                doc_chunks = text_splitter.split_text(doc['content'])
                all_chunks.extend(
                    {
                        'content': chunk,
                        'metadata': {
                            'source': doc['name'],
                            'document_id': doc['id'],
                            'collection_id': collection_id,
                        },
                    }
                    for chunk in doc_chunks
                )

            # Guard: documents may exist but contain no splittable text;
            # FAISS.from_texts would fail on an empty corpus.
            if not all_chunks:
                st.error("Documents contain no text to index.")
                return False

            # Create the vector store (metadatas passed by keyword per the
            # FAISS.from_texts signature).
            vector_store = FAISS.from_texts(
                [chunk['content'] for chunk in all_chunks],
                embeddings,
                metadatas=[chunk['metadata'] for chunk in all_chunks],
            )

            # Initialize QA system and publish it to the session.
            st.session_state.vector_store = vector_store
            st.session_state.qa_system = initialize_qa_system(vector_store)
            st.session_state.chat_ready = True

        return True
    except Exception as e:
        st.error(f"Error initializing chat system: {e}")
        return False