# NOTE(review): the lines "Spaces:" / "Build error" above the code were a
# banner scraped from the hosting page (the deployment's build-error status),
# not part of the module source; preserved here as a comment.
import streamlit as st
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOpenAI
# FAISS now lives in langchain_community; the old `langchain.vectorstores`
# path is deprecated (the file already depends on langchain_community).
from langchain_community.vectorstores import FAISS
from langchain_core.messages import SystemMessage, HumanMessage

from utils.database import (
    get_collection_documents,
    get_all_documents,
    get_embeddings_model,
    initialize_qa_system,
)
def generate_document_tags(content: str) -> List[str]:
    """Generate descriptive tags for a document using an LLM.

    Args:
        content: Raw document text. Only the first 2000 characters are sent
            to the model to bound prompt size / token cost.

    Returns:
        A list of non-empty tag strings. On any failure the error is shown
        via ``st.error`` and an empty list is returned (never raises).
    """
    try:
        llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo")
        prompt = """Analyze the following document content and generate relevant tags/keywords.
Focus on key themes, topics, and important terminology.
Return only the tags as a comma-separated list.
Content: {content}"""
        response = llm.invoke([
            SystemMessage(content="You are a document analysis assistant. Generate relevant tags as a comma-separated list only."),
            HumanMessage(content=prompt.format(content=content[:2000])),
        ])
        # Extract the text of the AI message and split on commas.
        tags_text = response.content
        # Filter out blanks: an empty model reply ("".split(',') == ['']),
        # trailing commas, or doubled commas would otherwise produce ""
        # entries in the returned tag list.
        return [tag.strip() for tag in tags_text.split(',') if tag.strip()]
    except Exception as e:
        st.error(f"Error generating tags: {e}")
        return []
def initialize_chat_system(collection_id=None) -> bool:
    """Build the document-chat QA system and stash it in session state.

    Fetches documents (a single collection when ``collection_id`` is given,
    otherwise every stored document), splits them into overlapping chunks,
    embeds the chunks into a FAISS vector store, and records the vector
    store, QA system, and readiness flag on ``st.session_state``.

    Args:
        collection_id: Optional collection to restrict the documents to.

    Returns:
        True on success; False when no documents exist or setup fails
        (failures are reported through ``st.error``, not raised).
    """
    try:
        conn = st.session_state.db_conn
        if collection_id:
            documents = get_collection_documents(conn, collection_id)
        else:
            documents = get_all_documents(conn)

        if not documents:
            st.error("No documents found.")
            return False

        with st.spinner("Processing documents..."):
            embeddings = get_embeddings_model()
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50,
                length_function=len,
            )

            # Accumulate chunk texts and their per-chunk metadata in
            # parallel lists, ready for FAISS.from_texts.
            texts = []
            metadatas = []
            for doc in documents:
                for piece in splitter.split_text(doc['content']):
                    texts.append(piece)
                    metadatas.append({
                        'source': doc['name'],
                        'document_id': doc['id'],
                        'collection_id': collection_id,
                    })

            vector_store = FAISS.from_texts(texts, embeddings, metadatas)

            # Expose everything the chat UI needs via session state.
            st.session_state.vector_store = vector_store
            st.session_state.qa_system = initialize_qa_system(vector_store)
            st.session_state.chat_ready = True
            return True
    except Exception as e:
        st.error(f"Error initializing chat system: {e}")
        return False