Spaces:
Sleeping
Sleeping
| <svg viewBox="0 0 1600 2000" xmlns="http://www.w3.org/2000/svg" role="img" aria-labelledby="lc-doc-title" aria-describedby="lc-doc-desc"> | |
| <!-- Accessible name and description for assistive technology; these are not painted on the canvas. --> | |
| <title id="lc-doc-title">LangChain Document Structure</title> | |
| <desc id="lc-doc-desc">Understanding the core components of LangChain Documents</desc> | |
| <!-- Background: covers the full 1600x2000 viewBox --> | |
| <rect width="1600" height="2000" fill="#F8FAFC"/> | |
| <!-- Title and subtitle, centred on x=800 (half the viewBox width) --> | |
| <text x="800" y="40" font-size="32" font-weight="bold" text-anchor="middle" fill="#1E293B">LangChain Document Structure</text> | |
| <text x="800" y="75" font-size="18" text-anchor="middle" fill="#64748B">Understanding the core components of LangChain Documents</text> | |
| <!-- Document Object Overview --> | |
| <g id="document-overview"> | |
| <!-- Outer card: icon on the left, class summary in the middle, construction example on the right --> | |
| <rect x="200" y="120" width="1200" height="300" fill="#FFF" stroke="#3B82F6" stroke-width="3" rx="12"/> | |
| <!-- Document Icon --> | |
| <circle cx="300" cy="220" r="60" fill="#DBEAFE"/> | |
| <text x="300" y="235" font-size="48" text-anchor="middle">📄</text> | |
| <!-- Document Class: heading plus the import statement on a grey chip --> | |
| <text x="450" y="170" font-size="24" font-weight="bold" fill="#1E293B">LangChain Document</text> | |
| <rect x="450" y="190" width="350" height="40" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="4"/> | |
| <text x="460" y="215" font-family="monospace" font-size="14" fill="#1F2937">from langchain.schema import Document</text> | |
| <!-- Key Properties --> | |
| <text x="450" y="270" font-size="16" fill="#64748B">Core Components:</text> | |
| <text x="450" y="300" font-size="18" font-weight="bold" fill="#3B82F6">• page_content (str)</text> | |
| <text x="450" y="330" font-size="18" font-weight="bold" fill="#10B981">• metadata (dict)</text> | |
| <!-- Code Example: one text element per source row, 25 units apart, x offset = indent level. --> | |
| <!-- The panel is 240 tall (y=160 to 400) so all nine rows, the last at y=385, sit inside it and inside the card (bottom at y=420). --> | |
| <rect x="850" y="160" width="500" height="240" fill="#1E293B" rx="8"/> | |
| <text x="870" y="185" font-family="monospace" font-size="12" fill="#F9FAFB"># Creating a Document</text> | |
| <text x="870" y="210" font-family="monospace" font-size="12" fill="#60A5FA">doc = Document(</text> | |
| <!-- Coloured tokens are tspan runs inside a single text element, so they follow one another without hand-placed x offsets and cannot overlap. --> | |
| <text x="890" y="235" font-family="monospace" font-size="12" fill="#F9FAFB">page_content=<tspan fill="#A5F3FC">"RAG is a technique..."</tspan>,</text> | |
| <text x="890" y="260" font-family="monospace" font-size="12" fill="#F9FAFB">metadata={</text> | |
| <text x="910" y="285" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"source"</tspan>: <tspan fill="#A5F3FC">"chapter1.pdf"</tspan>,</text> | |
| <text x="910" y="310" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"page"</tspan>: <tspan fill="#FBBF24">5</tspan>,</text> | |
| <text x="910" y="335" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"timestamp"</tspan>: <tspan fill="#A5F3FC">"2024-01-15"</tspan></text> | |
| <!-- The closing brace and closing parenthesis each get their own row; sharing one y made them render as ") }" on a single line. --> | |
| <text x="890" y="360" font-family="monospace" font-size="12" fill="#F9FAFB">}</text> | |
| <text x="870" y="385" font-family="monospace" font-size="12" fill="#60A5FA">)</text> | |
| </g> | |
| <!-- Page Content Component --> | |
| <g id="page-content"> | |
| <!-- Left-hand card describing the page_content field --> | |
| <rect x="50" y="480" width="700" height="600" fill="#FFF" stroke="#3B82F6" stroke-width="2" rx="12"/> | |
| <!-- Header bar: only the top corners are rounded (radius 12) to match the card. --> | |
| <!-- A rect cannot round individual corners (rx takes a single length), so the bar is drawn as a path. --> | |
| <path d="M50,540 V492 A12,12 0 0 1 62,480 H738 A12,12 0 0 1 750,492 V540 Z" fill="#3B82F6"/> | |
| <text x="400" y="515" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">page_content (String)</text> | |
| <!-- Description --> | |
| <rect x="80" y="570" width="640" height="80" fill="#F0F9FF" stroke="#3B82F6" stroke-width="1" rx="8"/> | |
| <text x="100" y="595" font-size="14" fill="#1E40AF">The actual text content of the document</text> | |
| <text x="100" y="615" font-size="14" fill="#1E40AF">• Contains the main information to be embedded and searched</text> | |
| <text x="100" y="635" font-size="14" fill="#1E40AF">• Must be a string (can be any length)</text> | |
| <!-- Examples --> | |
| <text x="100" y="690" font-size="18" font-weight="bold" fill="#1E293B">Examples:</text> | |
| <!-- Example 1: a quoted excerpt broken across three text rows --> | |
| <rect x="80" y="710" width="640" height="100" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="8"/> | |
| <text x="100" y="735" font-size="14" font-weight="bold" fill="#1F2937">Research Paper:</text> | |
| <text x="100" y="755" font-size="12" fill="#4B5563">"Retrieval-Augmented Generation (RAG) combines the benefits of</text> | |
| <text x="100" y="775" font-size="12" fill="#4B5563">pre-trained language models with information retrieval systems</text> | |
| <text x="100" y="795" font-size="12" fill="#4B5563">to generate more accurate and contextual responses..."</text> | |
| <!-- Example 2: same layout as example 1, shifted down by 120 --> | |
| <rect x="80" y="830" width="640" height="100" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="8"/> | |
| <text x="100" y="855" font-size="14" font-weight="bold" fill="#1F2937">Product Manual:</text> | |
| <text x="100" y="875" font-size="12" fill="#4B5563">"To install the software, first ensure your system meets the</text> | |
| <text x="100" y="895" font-size="12" fill="#4B5563">minimum requirements: Windows 10 or later, 8GB RAM, and</text> | |
| <text x="100" y="915" font-size="12" fill="#4B5563">at least 20GB of free disk space..."</text> | |
| <!-- Best Practices --> | |
| <rect x="80" y="950" width="640" height="100" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/> | |
| <text x="100" y="975" font-size="14" font-weight="bold" fill="#059669">✅ Best Practices:</text> | |
| <text x="100" y="1000" font-size="12" fill="#10B981">• Keep content focused and coherent</text> | |
| <text x="100" y="1020" font-size="12" fill="#10B981">• Remove unnecessary formatting before storage</text> | |
| <text x="100" y="1040" font-size="12" fill="#10B981">• Consider chunk size limits (typically 500-2000 tokens)</text> | |
| </g> | |
| <!-- Metadata Component --> | |
| <g id="metadata"> | |
| <!-- Right-hand card describing the metadata field --> | |
| <rect x="850" y="480" width="700" height="600" fill="#FFF" stroke="#10B981" stroke-width="2" rx="12"/> | |
| <!-- Header bar: only the top corners are rounded (radius 12) to match the card. --> | |
| <!-- A rect cannot round individual corners (rx takes a single length), so the bar is drawn as a path. --> | |
| <path d="M850,540 V492 A12,12 0 0 1 862,480 H1538 A12,12 0 0 1 1550,492 V540 Z" fill="#10B981"/> | |
| <text x="1200" y="515" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">metadata (Dictionary)</text> | |
| <!-- Description --> | |
| <rect x="880" y="570" width="640" height="80" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/> | |
| <text x="900" y="595" font-size="14" fill="#059669">Additional information about the document</text> | |
| <text x="900" y="615" font-size="14" fill="#059669">• Used for filtering, tracking, and context</text> | |
| <text x="900" y="635" font-size="14" fill="#059669">• Can contain any JSON-serializable data</text> | |
| <!-- Common Fields --> | |
| <text x="900" y="690" font-size="18" font-weight="bold" fill="#1E293B">Common Metadata Fields:</text> | |
| <!-- Metadata examples grid: 2 columns (x=0 and x=320) by 3 rows (y=0, 90, 180) of 300x80 tiles. --> | |
| <!-- Coordinates inside this group are relative to the translate origin (880, 720). --> | |
| <g transform="translate(880, 720)"> | |
| <!-- Source --> | |
| <rect x="0" y="0" width="300" height="80" fill="#DBEAFE" stroke="#3B82F6" stroke-width="1" rx="6"/> | |
| <text x="10" y="25" font-size="14" font-weight="bold" fill="#1E40AF">source</text> | |
| <text x="10" y="45" font-size="12" fill="#64748B">File path or URL</text> | |
| <text x="10" y="65" font-family="monospace" font-size="11" fill="#1F2937">"docs/manual.pdf"</text> | |
| <!-- Page --> | |
| <rect x="320" y="0" width="300" height="80" fill="#DBEAFE" stroke="#3B82F6" stroke-width="1" rx="6"/> | |
| <text x="330" y="25" font-size="14" font-weight="bold" fill="#1E40AF">page / chunk_id</text> | |
| <text x="330" y="45" font-size="12" fill="#64748B">Location in document</text> | |
| <text x="330" y="65" font-family="monospace" font-size="11" fill="#1F2937">page: 42, chunk: 7</text> | |
| <!-- Timestamp --> | |
| <rect x="0" y="90" width="300" height="80" fill="#FEF3C7" stroke="#F59E0B" stroke-width="1" rx="6"/> | |
| <text x="10" y="115" font-size="14" font-weight="bold" fill="#92400E">timestamp</text> | |
| <text x="10" y="135" font-size="12" fill="#64748B">Creation/modification date</text> | |
| <text x="10" y="155" font-family="monospace" font-size="11" fill="#1F2937">"2024-01-15T10:30:00Z"</text> | |
| <!-- Author --> | |
| <rect x="320" y="90" width="300" height="80" fill="#FEF3C7" stroke="#F59E0B" stroke-width="1" rx="6"/> | |
| <text x="330" y="115" font-size="14" font-weight="bold" fill="#92400E">author</text> | |
| <text x="330" y="135" font-size="12" fill="#64748B">Document creator</text> | |
| <text x="330" y="155" font-family="monospace" font-size="11" fill="#1F2937">"John Doe"</text> | |
| <!-- Category --> | |
| <rect x="0" y="180" width="300" height="80" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="6"/> | |
| <text x="10" y="205" font-size="14" font-weight="bold" fill="#7C3AED">category / type</text> | |
| <text x="10" y="225" font-size="12" fill="#64748B">Document classification</text> | |
| <text x="10" y="245" font-family="monospace" font-size="11" fill="#1F2937">"technical", "legal"</text> | |
| <!-- Language --> | |
| <rect x="320" y="180" width="300" height="80" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="6"/> | |
| <text x="330" y="205" font-size="14" font-weight="bold" fill="#7C3AED">language</text> | |
| <text x="330" y="225" font-size="12" fill="#64748B">Content language</text> | |
| <text x="330" y="245" font-family="monospace" font-size="11" fill="#1F2937">"en", "es", "fr"</text> | |
| </g> | |
| <!-- Custom metadata example: tip box below the grid, in absolute coordinates again --> | |
| <rect x="880" y="1000" width="640" height="60" fill="#FDF2F8" stroke="#EC4899" stroke-width="1" rx="8"/> | |
| <text x="900" y="1025" font-size="14" font-weight="bold" fill="#DB2777">💡 Tip: Add custom fields for your use case</text> | |
| <text x="900" y="1045" font-size="12" fill="#9F1239">Examples: department, security_level, version, keywords, embeddings_model</text> | |
| </g> | |
| <!-- Document Loaders Section --> | |
| <g id="document-loaders"> | |
| <!-- Full-width card listing document loaders --> | |
| <rect x="50" y="1120" width="1500" height="400" fill="#FFF" stroke="#8B5CF6" stroke-width="2" rx="12"/> | |
| <!-- Header bar: only the top corners are rounded (radius 12) to match the card. --> | |
| <!-- A rect cannot round individual corners (rx takes a single length), so the bar is drawn as a path. --> | |
| <path d="M50,1180 V1132 A12,12 0 0 1 62,1120 H1538 A12,12 0 0 1 1550,1132 V1180 Z" fill="#8B5CF6"/> | |
| <text x="800" y="1155" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">LangChain Document Loaders</text> | |
| <!-- Loader examples: four 340x140 tiles at x=0, 360, 720, 1080, relative to the translate origin (100, 1210). --> | |
| <!-- Each tile holds a 300-wide code box. The import rows are 48 to 54 characters long, which at 11px monospace --> | |
| <!-- is wider than the box, so textLength with lengthAdjust pins them to 280 units (box width minus 10 padding per side). --> | |
| <g transform="translate(100, 1210)"> | |
| <!-- PDF Loader --> | |
| <rect x="0" y="0" width="340" height="140" fill="#FFF" stroke="#EF4444" stroke-width="2" rx="8"/> | |
| <text x="170" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#DC2626">PDFLoader</text> | |
| <rect x="20" y="40" width="300" height="80" fill="#FEE2E2" rx="4"/> | |
| <text x="30" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import PyPDFLoader</text> | |
| <text x="30" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = PyPDFLoader("file.pdf")</text> | |
| <text x="30" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text> | |
| <!-- CSV Loader --> | |
| <rect x="360" y="0" width="340" height="140" fill="#FFF" stroke="#F59E0B" stroke-width="2" rx="8"/> | |
| <text x="530" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#D97706">CSVLoader</text> | |
| <rect x="380" y="40" width="300" height="80" fill="#FEF3C7" rx="4"/> | |
| <text x="390" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import CSVLoader</text> | |
| <text x="390" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = CSVLoader("data.csv")</text> | |
| <text x="390" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text> | |
| <!-- Web Loader --> | |
| <rect x="720" y="0" width="340" height="140" fill="#FFF" stroke="#3B82F6" stroke-width="2" rx="8"/> | |
| <text x="890" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#2563EB">WebBaseLoader</text> | |
| <rect x="740" y="40" width="300" height="80" fill="#DBEAFE" rx="4"/> | |
| <text x="750" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import WebBaseLoader</text> | |
| <text x="750" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = WebBaseLoader("https://...")</text> | |
| <text x="750" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text> | |
| <!-- Directory Loader --> | |
| <rect x="1080" y="0" width="340" height="140" fill="#FFF" stroke="#10B981" stroke-width="2" rx="8"/> | |
| <text x="1250" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#059669">DirectoryLoader</text> | |
| <rect x="1100" y="40" width="300" height="80" fill="#D1FAE5" rx="4"/> | |
| <text x="1110" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import DirectoryLoader</text> | |
| <text x="1110" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = DirectoryLoader("./docs")</text> | |
| <text x="1110" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text> | |
| </g> | |
| <!-- More loaders: four columns (x=100, 400, 700, 1000) of four rows each, 25 units apart, in absolute coordinates --> | |
| <text x="100" y="1380" font-size="16" fill="#64748B">Additional Loaders:</text> | |
| <text x="100" y="1410" font-size="14" fill="#4B5563">• UnstructuredLoader (multiple formats)</text> | |
| <text x="100" y="1435" font-size="14" fill="#4B5563">• JSONLoader</text> | |
| <text x="100" y="1460" font-size="14" fill="#4B5563">• TextLoader</text> | |
| <text x="100" y="1485" font-size="14" fill="#4B5563">• GitbookLoader</text> | |
| <text x="400" y="1410" font-size="14" fill="#4B5563">• NotionDirectoryLoader</text> | |
| <text x="400" y="1435" font-size="14" fill="#4B5563">• GoogleDriveLoader</text> | |
| <text x="400" y="1460" font-size="14" fill="#4B5563">• AirtableLoader</text> | |
| <text x="400" y="1485" font-size="14" fill="#4B5563">• SlackDirectoryLoader</text> | |
| <text x="700" y="1410" font-size="14" fill="#4B5563">• S3FileLoader</text> | |
| <text x="700" y="1435" font-size="14" fill="#4B5563">• YouTubeLoader</text> | |
| <text x="700" y="1460" font-size="14" fill="#4B5563">• WikipediaLoader</text> | |
| <text x="700" y="1485" font-size="14" fill="#4B5563">• ArxivLoader</text> | |
| <text x="1000" y="1410" font-size="14" fill="#4B5563">• ConfluenceLoader</text> | |
| <text x="1000" y="1435" font-size="14" fill="#4B5563">• DocugamiLoader</text> | |
| <text x="1000" y="1460" font-size="14" fill="#4B5563">• EverNoteLoader</text> | |
| <text x="1000" y="1485" font-size="14" fill="#4B5563">• HuggingFaceDatasetLoader</text> | |
| </g> | |
| <!-- Document Transformers --> | |
| <g id="transformers"> | |
| <!-- Full-width card listing text splitters --> | |
| <rect x="50" y="1560" width="1500" height="380" fill="#FFF" stroke="#EC4899" stroke-width="2" rx="12"/> | |
| <!-- Header bar: only the top corners are rounded (radius 12) to match the card. --> | |
| <!-- A rect cannot round individual corners (rx takes a single length), so the bar is drawn as a path. --> | |
| <path d="M50,1620 V1572 A12,12 0 0 1 62,1560 H1538 A12,12 0 0 1 1550,1572 V1620 Z" fill="#EC4899"/> | |
| <text x="800" y="1595" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">Document Transformers (Text Splitters)</text> | |
| <!-- Splitter types: four 340x120 tiles at x=0, 360, 720, 1080, relative to the translate origin (100, 1650). --> | |
| <!-- Inside each tile, argument rows are indented 20 units past the constructor call. --> | |
| <g transform="translate(100, 1650)"> | |
| <!-- Character Splitter --> | |
| <rect x="0" y="0" width="340" height="120" fill="#FDF2F8" stroke="#EC4899" stroke-width="1" rx="8"/> | |
| <text x="170" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#DB2777">CharacterTextSplitter</text> | |
| <text x="20" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = CharacterTextSplitter(</text> | |
| <text x="40" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=1000,</text> | |
| <text x="40" y="90" font-family="monospace" font-size="10" fill="#1F2937">chunk_overlap=200</text> | |
| <text x="20" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text> | |
| <!-- Recursive Splitter --> | |
| <rect x="360" y="0" width="340" height="120" fill="#F0F9FF" stroke="#3B82F6" stroke-width="1" rx="8"/> | |
| <text x="530" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#2563EB">RecursiveCharacterTextSplitter</text> | |
| <text x="380" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = RecursiveCharacterTextSplitter(</text> | |
| <text x="400" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=1000,</text> | |
| <text x="400" y="90" font-family="monospace" font-size="10" fill="#1F2937">separators=["\n\n", "\n", " "]</text> | |
| <text x="380" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text> | |
| <!-- Token Splitter --> | |
| <rect x="720" y="0" width="340" height="120" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/> | |
| <text x="890" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#059669">TokenTextSplitter</text> | |
| <text x="740" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = TokenTextSplitter(</text> | |
| <text x="760" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=500,</text> | |
| <text x="760" y="90" font-family="monospace" font-size="10" fill="#1F2937">model_name="gpt-3.5-turbo"</text> | |
| <text x="740" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text> | |
| <!-- Semantic Splitter --> | |
| <rect x="1080" y="0" width="340" height="120" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="8"/> | |
| <text x="1250" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#7C3AED">SemanticChunker</text> | |
| <text x="1100" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = SemanticChunker(</text> | |
| <text x="1120" y="70" font-family="monospace" font-size="10" fill="#1F2937">embeddings,</text> | |
| <text x="1120" y="90" font-family="monospace" font-size="10" fill="#1F2937">breakpoint_threshold_type="percentile"</text> | |
| <text x="1100" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text> | |
| </g> | |
| <!-- Usage example: dark code panel below the tiles, in absolute coordinates again --> | |
| <rect x="100" y="1800" width="1400" height="100" fill="#1E293B" rx="8"/> | |
| <text x="120" y="1825" font-family="monospace" font-size="12" fill="#F9FAFB"># Split documents into chunks</text> | |
| <text x="120" y="1850" font-family="monospace" font-size="12" fill="#60A5FA">chunks = splitter.split_documents(documents)</text> | |
| <text x="120" y="1875" font-family="monospace" font-size="12" fill="#F9FAFB"># Each chunk is a new Document with preserved metadata</text> | |
| </g> | |
| <!-- Arrow definitions --> | |
| <defs> | |
| <!-- Arrowhead markers: 10x10 triangles anchored at their centre (5,5); orient="auto" rotates them to follow the line direction. --> | |
| <!-- No element in the markup shown references these ids through marker-start, marker-mid or marker-end. --> | |
| <marker id="arrowblue" orient="auto" refX="5" refY="5" markerWidth="10" markerHeight="10"> | |
| <path fill="#3B82F6" d="M0 0V10L10 5Z"/> | |
| </marker> | |
| <marker id="arrowgreen" orient="auto" refX="5" refY="5" markerWidth="10" markerHeight="10"> | |
| <path fill="#10B981" d="M0 0V10L10 5Z"/> | |
| </marker> | |
| </defs> | |
| </svg> |