"""Content ingestion pipeline: read markdown docs, chunk them, embed, store in Qdrant.

Embedding and Qdrant storage are placeholders pending Gemini / Qdrant integration.
"""

import os
from pathlib import Path
from typing import List


def read_markdown_files(docs_path: Path) -> List[str]:
    """Read every top-level ``*.md`` file in *docs_path* and return their contents.

    Files are processed in sorted (lexicographic) order so that downstream
    per-file metadata (e.g. ``chapter_N``) is deterministic across platforms —
    ``Path.glob`` alone yields filesystem-dependent ordering.

    Args:
        docs_path: Directory to scan (non-recursive).

    Returns:
        One string per markdown file, in sorted filename order.
    """
    return [
        md_file.read_text(encoding="utf-8")
        for md_file in sorted(docs_path.glob("*.md"))
    ]


def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[str]:
    """Split *text* into word-based chunks of up to *chunk_size* words.

    Consecutive chunks share *chunk_overlap* words so context is preserved
    across chunk boundaries. This is a basic whitespace tokenization; a more
    sophisticated approach would respect sentence boundaries and
    Docusaurus-specific markdown structure.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        chunk_overlap: Words shared between adjacent chunks; must be
            smaller than *chunk_size*.

    Returns:
        Non-empty chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If *chunk_size* is not positive or *chunk_overlap* is not
            smaller than *chunk_size* (the window could never advance —
            previously this surfaced as an opaque ``range()`` error or a
            silent empty result).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - chunk_overlap
    # Every window starting at i < len(words) is non-empty, so no emptiness
    # filter is needed.
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]


def generate_embeddings(text_chunks: List[str]) -> List[List[float]]:
    """Generate embeddings for *text_chunks*.

    Placeholder for Gemini model integration: returns one dummy
    768-dimensional vector per chunk (768 assumed pending model selection).
    """
    print("Generating embeddings (placeholder)...")
    return [[0.1] * 768 for _ in text_chunks]


def store_in_qdrant(embeddings: List[List[float]], metadatas: List[dict]) -> None:
    """Store *embeddings* with their *metadatas* in Qdrant.

    Placeholder for Qdrant client integration; currently only logs the count.
    ``embeddings`` and ``metadatas`` are expected to be parallel lists.
    """
    print(f"Storing {len(embeddings)} embeddings in Qdrant (placeholder)...")


def process_content(docs_root: Path) -> None:
    """Run the full pipeline: read markdown under *docs_root*, chunk, embed, store.

    Each source file's chunks are tagged with simple ``{"source": "chapter_N"}``
    metadata (N is the 1-based file index); this can be expanded to include
    source filename, chapter title, etc. Exits early with a message when no
    content is found.
    """
    print(f"Processing markdown files from: {docs_root}")
    all_markdown_text = read_markdown_files(docs_root)

    all_chunks: List[str] = []
    all_metadatas: List[dict] = []
    for i, text in enumerate(all_markdown_text):
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        # Example metadata — one entry per chunk, parallel to all_chunks.
        all_metadatas.extend([{"source": f"chapter_{i+1}"}] * len(chunks))

    if not all_chunks:
        print("No content to process.")
        return

    embeddings = generate_embeddings(all_chunks)
    store_in_qdrant(embeddings, all_metadatas)
    print("Content processing complete.")


if __name__ == "__main__":
    # Example usage: assumes the script lives four levels below the project
    # root, with docs in <root>/frontend/docs — TODO confirm this layout.
    frontend_docs_path = Path(__file__).parent.parent.parent.parent / "frontend" / "docs"
    process_content(frontend_docs_path)