import re

import pdfplumber
import tiktoken
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from utilities.constants import (
    CHUNKING_STRATEGY_SECTION_BASED,
    CHUNKING_STRATEGY_SEMANTIC,
    CHUNKING_STRATEGY_TABLE_AWARE,
)
from utilities.debugger import dprint
def create_vector_store(app_state, model_run_state, **kwargs):
    """Chunk the loaded documents and build an in-memory Qdrant retriever."""
    # Apply any keyword overrides (e.g. chunk_size) to the run state.
    for key, value in kwargs.items():
        if hasattr(model_run_state, key):
            setattr(model_run_state, key, value)
        else:
            print(f"Warning: {key} is not an attribute of the state object")

    dprint(app_state, f"Chunk size after update: {model_run_state.chunk_size}")
    create_chunked_documents(app_state, model_run_state)

    # Embed the chunks and index them in an in-memory Qdrant collection.
    qdrant_vectorstore = Qdrant.from_documents(
        documents=model_run_state.combined_document_objects,
        embedding=model_run_state.embedding_model,
        location=":memory:",
    )
    model_run_state.retriever = qdrant_vectorstore.as_retriever()
    print("Vector store created")
def tiktoken_len(text):
    """Length function for the splitters: count tokens with the gpt-4o tokenizer."""
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
    return len(tokens)
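# For reference, token counts differ from character counts: with the encoding
# tiktoken currently maps to gpt-4o, tiktoken_len("hello world") is 2, while
# len("hello world") is 11. Exact counts depend on the tokenizer version.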
def create_chunked_documents(app_state, model_run_state):
    """Dispatch to the configured chunking strategy and store the results."""
    dprint(app_state, f"Chunking strategy: {model_run_state.chunking_strategy}")
    if model_run_state.chunking_strategy == CHUNKING_STRATEGY_TABLE_AWARE:
        combined_document_objects = chunk_with_table_aware(app_state, model_run_state)
    elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SECTION_BASED:
        combined_document_objects = chunk_with_section_based(app_state, model_run_state)
    elif model_run_state.chunking_strategy == CHUNKING_STRATEGY_SEMANTIC:
        combined_document_objects = chunk_with_semantic_splitter(app_state, model_run_state)
    else:
        # Default: plain recursive character splitting.
        combined_document_objects = chunk_with_recursive_splitter(app_state, model_run_state)
    model_run_state.combined_document_objects = combined_document_objects
    dprint(app_state, "Chunking completed successfully")
def chunk_with_recursive_splitter(app_state, model_run_state):
    """Split each document into fixed-size, token-counted chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=model_run_state.chunk_size,
        chunk_overlap=model_run_state.chunk_overlap,
        length_function=tiktoken_len,
    )
    combined_document_objects = []
    dprint(app_state, "Chunking documents and creating document objects")
    for document in app_state.documents:
        dprint(app_state, f"Processing document: {document['title']}")
        text = document["single_text_document"]
        title = document["title"]
        chunks_document = text_splitter.split_text(text)
        dprint(app_state, f"Number of chunks: {len(chunks_document)}")
        for chunk_number, chunk in enumerate(chunks_document, start=1):
            document_object = Document(
                page_content=chunk,
                metadata={
                    "source": title,
                    "document_id": document.get("document_id", "default_id"),
                    "chunk_number": chunk_number,  # Unique chunk number within the document
                },
            )
            combined_document_objects.append(document_object)
    return combined_document_objects
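# Each chunk becomes a Document like the following (illustrative values only):
# Document(page_content="...chunk text...",
#          metadata={"source": "Some Title", "document_id": "default_id",
#                    "chunk_number": 1})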
def chunk_with_table_aware(app_state, model_run_state):
    """Extract PDF tables as dedicated chunks, then chunk the remaining text."""
    combined_document_objects = []
    dprint(app_state, "Using Table-Aware Chunking for documents.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=model_run_state.chunk_size,
        chunk_overlap=model_run_state.chunk_overlap,
    )
    for document in app_state.documents:
        title = document["title"]
        text = document["single_text_document"]
        # If the document is a PDF, pull each table out as its own chunk so
        # tabular data is not split mid-row by the text splitter.
        if document.get("is_pdf", False):
            with pdfplumber.open(document["file_path"]) as pdf:
                for page in pdf.pages:
                    for table in page.extract_tables():
                        # Each row is a list of cell values; keep one row per line.
                        table_content = "\n".join(str(row) for row in table)
                        combined_document_objects.append(
                            Document(
                                page_content=table_content,
                                metadata={
                                    "source": title,
                                    "document_id": document.get("document_id", "default_id"),
                                    "chunk_number": "table",
                                },
                            )
                        )
        # Chunk the full text as well; note that table text may also appear here.
        chunks_document = text_splitter.split_text(text)
        for chunk_number, chunk in enumerate(chunks_document, start=1):
            combined_document_objects.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "source": title,
                        "document_id": document.get("document_id", "default_id"),
                        "chunk_number": chunk_number,
                    },
                )
            )
    return combined_document_objects
def chunk_with_section_based(app_state, model_run_state):
    """Split each document at heading-like lines, then chunk each section."""
    combined_document_objects = []
    dprint(app_state, "Using Section-Based Chunking for documents.")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=model_run_state.chunk_size,
        chunk_overlap=model_run_state.chunk_overlap,
    )
    for document in app_state.documents:
        text = document["single_text_document"]
        title = document["title"]
        # Split at heading-like lines (a line starting with a capital letter).
        sections = re.split(r"\n[A-Z].+?\n", text)
        for section_number, section in enumerate(sections, start=1):
            chunks_document = text_splitter.split_text(section)
            for chunk_number, chunk in enumerate(chunks_document, start=1):
                combined_document_objects.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "source": title,
                            "document_id": document.get("document_id", "default_id"),
                            "section_number": section_number,
                            "chunk_number": chunk_number,
                        },
                    )
                )
    return combined_document_objects
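# Worked example of the section split used above. Because the pattern has no
# capture group, re.split drops the matched heading lines themselves:
#   re.split(r"\n[A-Z].+?\n", "intro\nSection One\nbody\nSection Two\nmore")
#   returns ["intro", "body", "more"]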
def chunk_with_semantic_splitter(app_state, model_run_state):
    """Group consecutive, semantically similar sentences into chunks."""
    # Pre-trained sentence-embedding model used to score sentence similarity.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    combined_document_objects = []
    dprint(app_state, "Using Semantic-Based Chunking for documents.")
    for document in app_state.documents:
        text = document["single_text_document"]
        title = document["title"]
        # Naive sentence split; a proper sentence tokenizer would be more robust.
        sentences = text.split(". ")
        sentence_embeddings = model.encode(sentences)
        # Group consecutive sentences, starting a new chunk when the similarity
        # between neighbours drops below the threshold or the chunk grows too
        # large (note: size is measured in sentences here, not tokens).
        chunks = []
        current_chunk = []
        for i in range(len(sentences) - 1):
            current_chunk.append(sentences[i])
            sim = cosine_similarity(
                [sentence_embeddings[i]], [sentence_embeddings[i + 1]]
            )[0][0]
            if sim < 0.7 or len(current_chunk) >= model_run_state.chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
        # Add the last sentence (the loop above stops one short), then flush
        # whatever remains as the final chunk.
        if sentences:
            current_chunk.append(sentences[-1])
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        # Create document objects for the chunks
        for chunk_number, chunk in enumerate(chunks, start=1):
            combined_document_objects.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "source": title,
                        "document_id": document.get("document_id", "default_id"),
                        "chunk_number": chunk_number,
                    },
                )
            )
    return combined_document_objects
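# ---------------------------------------------------------------------------
# Minimal usage sketch. The real app_state / model_run_state classes live
# elsewhere in this repo; the SimpleNamespace stand-ins below are assumptions
# for illustration, populated only with the attributes this module reads.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    from langchain_openai.embeddings import OpenAIEmbeddings

    app_state = SimpleNamespace(
        debug=True,  # hypothetical flag; dprint's actual interface may differ
        documents=[
            {
                "title": "Example Doc",
                "document_id": "doc-1",
                "single_text_document": "Intro text.\nSection One\nBody of section one.",
            }
        ],
    )
    model_run_state = SimpleNamespace(
        chunking_strategy=CHUNKING_STRATEGY_SECTION_BASED,
        chunk_size=500,
        chunk_overlap=50,
        embedding_model=OpenAIEmbeddings(),  # requires OPENAI_API_KEY to be set
        combined_document_objects=None,
        retriever=None,
    )
    create_vector_store(app_state, model_run_state)
    print(model_run_state.retriever.invoke("section one"))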