<!--
pdf-chatbot / notebook /1-langchain-document-components.svg
manasvi63
Complete Pipeline
bf10662
-->
<svg viewBox="0 0 1600 2000" xmlns="http://www.w3.org/2000/svg" aria-labelledby="diagram-title diagram-desc">
  <!-- Accessible name and description for the whole diagram. role="img" is
       intentionally not set so the text inside the drawing stays readable
       by assistive technology. -->
  <title id="diagram-title">LangChain Document Structure</title>
  <desc id="diagram-desc">Diagram of the LangChain Document object with its page_content and metadata fields, followed by example document loaders and text splitters.</desc>
  <!-- Background: fills the full 1600x2000 viewBox -->
  <rect width="1600" height="2000" fill="#F8FAFC"/>
  <!-- Title and subtitle, centred on x=800 (half the viewBox width) -->
  <text x="800" y="40" font-size="32" font-weight="bold" text-anchor="middle" fill="#1E293B">LangChain Document Structure</text>
  <text x="800" y="75" font-size="18" text-anchor="middle" fill="#64748B">Understanding the core components of LangChain Documents</text>
<!-- Document Object Overview -->
<g id="document-overview">
  <!-- Overview card: icon, import line, the two core fields, and a code sample -->
  <rect x="200" y="120" width="1200" height="300" fill="#FFF" stroke="#3B82F6" stroke-width="3" rx="12"/>
  <!-- Document Icon -->
  <circle cx="300" cy="220" r="60" fill="#DBEAFE"/>
  <text x="300" y="235" font-size="48" text-anchor="middle">📄</text>
  <!-- Document Class -->
  <text x="450" y="170" font-size="24" font-weight="bold" fill="#1E293B">LangChain Document</text>
  <rect x="450" y="190" width="350" height="40" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="4"/>
  <text x="460" y="215" font-family="monospace" font-size="14" fill="#1F2937">from langchain.schema import Document</text>
  <!-- Key Properties -->
  <text x="450" y="270" font-size="16" fill="#64748B">Core Components:</text>
  <text x="450" y="300" font-size="18" font-weight="bold" fill="#3B82F6">• page_content (str)</text>
  <text x="450" y="330" font-size="18" font-weight="bold" fill="#10B981">• metadata (dict)</text>
  <!-- Code Example.
       Each line of sample code is ONE text element; colour changes use tspan
       so glyph positions come from the font's own advances instead of
       hand-tuned x offsets (which made punctuation collide with strings).
       The panel is 240 high so the closing ")" gets its own line at y=385
       and still sits inside the panel (y 160 to 400) and the card (to 420). -->
  <rect x="850" y="160" width="500" height="240" fill="#1E293B" rx="8"/>
  <text x="870" y="185" font-family="monospace" font-size="12" fill="#F9FAFB"># Creating a Document</text>
  <text x="870" y="210" font-family="monospace" font-size="12" fill="#60A5FA">doc = Document(</text>
  <text x="890" y="235" font-family="monospace" font-size="12" fill="#F9FAFB">page_content=<tspan fill="#A5F3FC">"RAG is a technique..."</tspan>,</text>
  <text x="890" y="260" font-family="monospace" font-size="12" fill="#F9FAFB">metadata={</text>
  <text x="910" y="285" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"source"</tspan>: <tspan fill="#A5F3FC">"chapter1.pdf"</tspan>,</text>
  <text x="910" y="310" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"page"</tspan>: <tspan fill="#FBBF24">5</tspan>,</text>
  <text x="910" y="335" font-family="monospace" font-size="12" fill="#F9FAFB"><tspan fill="#A5F3FC">"timestamp"</tspan>: <tspan fill="#A5F3FC">"2024-01-15"</tspan></text>
  <!-- Closing brace of the metadata dict, then the closing paren of Document( on the next line -->
  <text x="890" y="360" font-family="monospace" font-size="12" fill="#F9FAFB">}</text>
  <text x="870" y="385" font-family="monospace" font-size="12" fill="#60A5FA">)</text>
</g>
<!-- Page Content Component -->
<g id="page-content">
  <!-- Left-hand card describing the page_content field -->
  <rect x="50" y="480" width="700" height="600" fill="#FFF" stroke="#3B82F6" stroke-width="2" rx="12"/>
  <!-- Header band.
       A rect cannot round only some corners (rx takes a single length, so
       "12 12 0 0" is invalid and ignored, leaving square corners). This path
       draws the same 700x60 band with 12px radii on the top corners only,
       matching the card's rx="12". -->
  <path d="M50,540 V492 A12,12 0 0 1 62,480 H738 A12,12 0 0 1 750,492 V540 Z" fill="#3B82F6"/>
  <text x="400" y="515" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">page_content (String)</text>
  <!-- Description -->
  <rect x="80" y="570" width="640" height="80" fill="#F0F9FF" stroke="#3B82F6" stroke-width="1" rx="8"/>
  <text x="100" y="595" font-size="14" fill="#1E40AF">The actual text content of the document</text>
  <text x="100" y="615" font-size="14" fill="#1E40AF">• Contains the main information to be embedded and searched</text>
  <text x="100" y="635" font-size="14" fill="#1E40AF">• Must be a string (can be any length)</text>
  <!-- Examples -->
  <text x="100" y="690" font-size="18" font-weight="bold" fill="#1E293B">Examples:</text>
  <!-- Example 1: three manually wrapped lines, 20px apart -->
  <rect x="80" y="710" width="640" height="100" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="8"/>
  <text x="100" y="735" font-size="14" font-weight="bold" fill="#1F2937">Research Paper:</text>
  <text x="100" y="755" font-size="12" fill="#4B5563">"Retrieval-Augmented Generation (RAG) combines the benefits of</text>
  <text x="100" y="775" font-size="12" fill="#4B5563">pre-trained language models with information retrieval systems</text>
  <text x="100" y="795" font-size="12" fill="#4B5563">to generate more accurate and contextual responses..."</text>
  <!-- Example 2: same layout, shifted down 120px -->
  <rect x="80" y="830" width="640" height="100" fill="#F3F4F6" stroke="#9CA3AF" stroke-width="1" rx="8"/>
  <text x="100" y="855" font-size="14" font-weight="bold" fill="#1F2937">Product Manual:</text>
  <text x="100" y="875" font-size="12" fill="#4B5563">"To install the software, first ensure your system meets the</text>
  <text x="100" y="895" font-size="12" fill="#4B5563">minimum requirements: Windows 10 or later, 8GB RAM, and</text>
  <text x="100" y="915" font-size="12" fill="#4B5563">at least 20GB of free disk space..."</text>
  <!-- Best Practices -->
  <rect x="80" y="950" width="640" height="100" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/>
  <text x="100" y="975" font-size="14" font-weight="bold" fill="#059669">✅ Best Practices:</text>
  <text x="100" y="1000" font-size="12" fill="#10B981">• Keep content focused and coherent</text>
  <text x="100" y="1020" font-size="12" fill="#10B981">• Remove unnecessary formatting before storage</text>
  <text x="100" y="1040" font-size="12" fill="#10B981">• Consider chunk size limits (typically 500-2000 tokens)</text>
</g>
<!-- Metadata Component -->
<g id="metadata">
  <!-- Right-hand card describing the metadata field -->
  <rect x="850" y="480" width="700" height="600" fill="#FFF" stroke="#10B981" stroke-width="2" rx="12"/>
  <!-- Header band.
       rx accepts a single length, so the former rx="12 12 0 0" was invalid
       and rendered square corners. This path is the same 700x60 band with
       only the two top corners rounded (radius 12, matching the card). -->
  <path d="M850,540 V492 A12,12 0 0 1 862,480 H1538 A12,12 0 0 1 1550,492 V540 Z" fill="#10B981"/>
  <text x="1200" y="515" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">metadata (Dictionary)</text>
  <!-- Description -->
  <rect x="880" y="570" width="640" height="80" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/>
  <text x="900" y="595" font-size="14" fill="#059669">Additional information about the document</text>
  <text x="900" y="615" font-size="14" fill="#059669">• Used for filtering, tracking, and context</text>
  <text x="900" y="635" font-size="14" fill="#059669">• Can contain any JSON-serializable data</text>
  <!-- Common Fields -->
  <text x="900" y="690" font-size="18" font-weight="bold" fill="#1E293B">Common Metadata Fields:</text>
  <!-- Metadata examples grid: 2 columns (x=0 and x=320) by 3 rows (y=0, 90, 180),
       laid out in local coordinates and moved into place by the translate -->
  <g transform="translate(880, 720)">
    <!-- Source -->
    <rect x="0" y="0" width="300" height="80" fill="#DBEAFE" stroke="#3B82F6" stroke-width="1" rx="6"/>
    <text x="10" y="25" font-size="14" font-weight="bold" fill="#1E40AF">source</text>
    <text x="10" y="45" font-size="12" fill="#64748B">File path or URL</text>
    <text x="10" y="65" font-family="monospace" font-size="11" fill="#1F2937">"docs/manual.pdf"</text>
    <!-- Page -->
    <rect x="320" y="0" width="300" height="80" fill="#DBEAFE" stroke="#3B82F6" stroke-width="1" rx="6"/>
    <text x="330" y="25" font-size="14" font-weight="bold" fill="#1E40AF">page / chunk_id</text>
    <text x="330" y="45" font-size="12" fill="#64748B">Location in document</text>
    <text x="330" y="65" font-family="monospace" font-size="11" fill="#1F2937">page: 42, chunk: 7</text>
    <!-- Timestamp -->
    <rect x="0" y="90" width="300" height="80" fill="#FEF3C7" stroke="#F59E0B" stroke-width="1" rx="6"/>
    <text x="10" y="115" font-size="14" font-weight="bold" fill="#92400E">timestamp</text>
    <text x="10" y="135" font-size="12" fill="#64748B">Creation/modification date</text>
    <text x="10" y="155" font-family="monospace" font-size="11" fill="#1F2937">"2024-01-15T10:30:00Z"</text>
    <!-- Author -->
    <rect x="320" y="90" width="300" height="80" fill="#FEF3C7" stroke="#F59E0B" stroke-width="1" rx="6"/>
    <text x="330" y="115" font-size="14" font-weight="bold" fill="#92400E">author</text>
    <text x="330" y="135" font-size="12" fill="#64748B">Document creator</text>
    <text x="330" y="155" font-family="monospace" font-size="11" fill="#1F2937">"John Doe"</text>
    <!-- Category -->
    <rect x="0" y="180" width="300" height="80" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="6"/>
    <text x="10" y="205" font-size="14" font-weight="bold" fill="#7C3AED">category / type</text>
    <text x="10" y="225" font-size="12" fill="#64748B">Document classification</text>
    <text x="10" y="245" font-family="monospace" font-size="11" fill="#1F2937">"technical", "legal"</text>
    <!-- Language -->
    <rect x="320" y="180" width="300" height="80" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="6"/>
    <text x="330" y="205" font-size="14" font-weight="bold" fill="#7C3AED">language</text>
    <text x="330" y="225" font-size="12" fill="#64748B">Content language</text>
    <text x="330" y="245" font-family="monospace" font-size="11" fill="#1F2937">"en", "es", "fr"</text>
  </g>
  <!-- Custom metadata example -->
  <rect x="880" y="1000" width="640" height="60" fill="#FDF2F8" stroke="#EC4899" stroke-width="1" rx="8"/>
  <text x="900" y="1025" font-size="14" font-weight="bold" fill="#DB2777">💡 Tip: Add custom fields for your use case</text>
  <text x="900" y="1045" font-size="12" fill="#9F1239">Examples: department, security_level, version, keywords, embeddings_model</text>
</g>
<!-- Document Loaders Section -->
<g id="document-loaders">
  <!-- Full-width card listing example document loaders -->
  <rect x="50" y="1120" width="1500" height="400" fill="#FFF" stroke="#8B5CF6" stroke-width="2" rx="12"/>
  <!-- Header band.
       rx accepts a single length, so the former rx="12 12 0 0" was invalid
       and rendered square corners. This path is the same 1500x60 band with
       only the two top corners rounded (radius 12, matching the card). -->
  <path d="M50,1180 V1132 A12,12 0 0 1 62,1120 H1538 A12,12 0 0 1 1550,1132 V1180 Z" fill="#8B5CF6"/>
  <text x="800" y="1155" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">LangChain Document Loaders</text>
  <!-- Loader examples: four 340-wide cards on a 360px pitch, in local
       coordinates. Each card holds a 300-wide tinted code panel.
       The import lines are 48 to 54 characters long, which at 11px monospace
       is wider than the panel, so they carry textLength="280" with
       lengthAdjust="spacingAndGlyphs" to squeeze them into the panel with
       10px of padding on each side instead of spilling into the next card. -->
  <g transform="translate(100, 1210)">
    <!-- PDF Loader -->
    <rect x="0" y="0" width="340" height="140" fill="#FFF" stroke="#EF4444" stroke-width="2" rx="8"/>
    <text x="170" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#DC2626">PDFLoader</text>
    <rect x="20" y="40" width="300" height="80" fill="#FEE2E2" rx="4"/>
    <text x="30" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import PyPDFLoader</text>
    <text x="30" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = PyPDFLoader("file.pdf")</text>
    <text x="30" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text>
    <!-- CSV Loader -->
    <rect x="360" y="0" width="340" height="140" fill="#FFF" stroke="#F59E0B" stroke-width="2" rx="8"/>
    <text x="530" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#D97706">CSVLoader</text>
    <rect x="380" y="40" width="300" height="80" fill="#FEF3C7" rx="4"/>
    <text x="390" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import CSVLoader</text>
    <text x="390" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = CSVLoader("data.csv")</text>
    <text x="390" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text>
    <!-- Web Loader -->
    <rect x="720" y="0" width="340" height="140" fill="#FFF" stroke="#3B82F6" stroke-width="2" rx="8"/>
    <text x="890" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#2563EB">WebBaseLoader</text>
    <rect x="740" y="40" width="300" height="80" fill="#DBEAFE" rx="4"/>
    <text x="750" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import WebBaseLoader</text>
    <text x="750" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = WebBaseLoader("https://...")</text>
    <text x="750" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text>
    <!-- Directory Loader -->
    <rect x="1080" y="0" width="340" height="140" fill="#FFF" stroke="#10B981" stroke-width="2" rx="8"/>
    <text x="1250" y="25" font-size="16" font-weight="bold" text-anchor="middle" fill="#059669">DirectoryLoader</text>
    <rect x="1100" y="40" width="300" height="80" fill="#D1FAE5" rx="4"/>
    <text x="1110" y="60" font-family="monospace" font-size="11" fill="#1F2937" textLength="280" lengthAdjust="spacingAndGlyphs">from langchain.document_loaders import DirectoryLoader</text>
    <text x="1110" y="80" font-family="monospace" font-size="11" fill="#1F2937">loader = DirectoryLoader("./docs")</text>
    <text x="1110" y="100" font-family="monospace" font-size="11" fill="#1F2937">documents = loader.load()</text>
  </g>
  <!-- More loaders: four columns (x=100, 400, 700, 1000), four rows 25px apart -->
  <text x="100" y="1380" font-size="16" fill="#64748B">Additional Loaders:</text>
  <text x="100" y="1410" font-size="14" fill="#4B5563">• UnstructuredLoader (multiple formats)</text>
  <text x="100" y="1435" font-size="14" fill="#4B5563">• JSONLoader</text>
  <text x="100" y="1460" font-size="14" fill="#4B5563">• TextLoader</text>
  <text x="100" y="1485" font-size="14" fill="#4B5563">• GitbookLoader</text>
  <text x="400" y="1410" font-size="14" fill="#4B5563">• NotionDirectoryLoader</text>
  <text x="400" y="1435" font-size="14" fill="#4B5563">• GoogleDriveLoader</text>
  <text x="400" y="1460" font-size="14" fill="#4B5563">• AirtableLoader</text>
  <text x="400" y="1485" font-size="14" fill="#4B5563">• SlackDirectoryLoader</text>
  <text x="700" y="1410" font-size="14" fill="#4B5563">• S3FileLoader</text>
  <text x="700" y="1435" font-size="14" fill="#4B5563">• YouTubeLoader</text>
  <text x="700" y="1460" font-size="14" fill="#4B5563">• WikipediaLoader</text>
  <text x="700" y="1485" font-size="14" fill="#4B5563">• ArxivLoader</text>
  <text x="1000" y="1410" font-size="14" fill="#4B5563">• ConfluenceLoader</text>
  <text x="1000" y="1435" font-size="14" fill="#4B5563">• DocugamiLoader</text>
  <text x="1000" y="1460" font-size="14" fill="#4B5563">• EverNoteLoader</text>
  <text x="1000" y="1485" font-size="14" fill="#4B5563">• HuggingFaceDatasetLoader</text>
</g>
<!-- Document Transformers -->
<g id="transformers">
  <!-- Full-width card showing text splitter examples -->
  <rect x="50" y="1560" width="1500" height="380" fill="#FFF" stroke="#EC4899" stroke-width="2" rx="12"/>
  <!-- Header band.
       rx accepts a single length, so the former rx="12 12 0 0" was invalid
       and rendered square corners. This path is the same 1500x60 band with
       only the two top corners rounded (radius 12, matching the card). -->
  <path d="M50,1620 V1572 A12,12 0 0 1 62,1560 H1538 A12,12 0 0 1 1550,1572 V1620 Z" fill="#EC4899"/>
  <text x="800" y="1595" font-size="22" font-weight="bold" text-anchor="middle" fill="#FFF">Document Transformers (Text Splitters)</text>
  <!-- Splitter types: four 340-wide cards on a 360px pitch, in local
       coordinates; argument lines are indented 20px past the call line -->
  <g transform="translate(100, 1650)">
    <!-- Character Splitter -->
    <rect x="0" y="0" width="340" height="120" fill="#FDF2F8" stroke="#EC4899" stroke-width="1" rx="8"/>
    <text x="170" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#DB2777">CharacterTextSplitter</text>
    <text x="20" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = CharacterTextSplitter(</text>
    <text x="40" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=1000,</text>
    <text x="40" y="90" font-family="monospace" font-size="10" fill="#1F2937">chunk_overlap=200</text>
    <text x="20" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text>
    <!-- Recursive Splitter -->
    <rect x="360" y="0" width="340" height="120" fill="#F0F9FF" stroke="#3B82F6" stroke-width="1" rx="8"/>
    <text x="530" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#2563EB">RecursiveCharacterTextSplitter</text>
    <text x="380" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = RecursiveCharacterTextSplitter(</text>
    <text x="400" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=1000,</text>
    <text x="400" y="90" font-family="monospace" font-size="10" fill="#1F2937">separators=["\n\n", "\n", " "]</text>
    <text x="380" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text>
    <!-- Token Splitter -->
    <rect x="720" y="0" width="340" height="120" fill="#F0FDF4" stroke="#10B981" stroke-width="1" rx="8"/>
    <text x="890" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#059669">TokenTextSplitter</text>
    <text x="740" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = TokenTextSplitter(</text>
    <text x="760" y="70" font-family="monospace" font-size="10" fill="#1F2937">chunk_size=500,</text>
    <text x="760" y="90" font-family="monospace" font-size="10" fill="#1F2937">model_name="gpt-3.5-turbo"</text>
    <text x="740" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text>
    <!-- Semantic Splitter -->
    <rect x="1080" y="0" width="340" height="120" fill="#F3E8FF" stroke="#8B5CF6" stroke-width="1" rx="8"/>
    <text x="1250" y="25" font-size="14" font-weight="bold" text-anchor="middle" fill="#7C3AED">SemanticChunker</text>
    <text x="1100" y="50" font-family="monospace" font-size="10" fill="#1F2937">splitter = SemanticChunker(</text>
    <text x="1120" y="70" font-family="monospace" font-size="10" fill="#1F2937">embeddings,</text>
    <text x="1120" y="90" font-family="monospace" font-size="10" fill="#1F2937">breakpoint_threshold_type="percentile"</text>
    <text x="1100" y="110" font-family="monospace" font-size="10" fill="#1F2937">)</text>
  </g>
  <!-- Usage example: dark code panel spanning the card width -->
  <rect x="100" y="1800" width="1400" height="100" fill="#1E293B" rx="8"/>
  <text x="120" y="1825" font-family="monospace" font-size="12" fill="#F9FAFB"># Split documents into chunks</text>
  <text x="120" y="1850" font-family="monospace" font-size="12" fill="#60A5FA">chunks = splitter.split_documents(documents)</text>
  <text x="120" y="1875" font-family="monospace" font-size="12" fill="#F9FAFB"># Each chunk is a new Document with preserved metadata</text>
</g>
<!-- Arrow definitions -->
<defs>
  <!-- Reusable arrowheads: a 10x10 right-pointing triangle anchored at its
       centre (5,5) and rotated to follow the path direction.
       NOTE(review): no element in this file as shown references these via
       marker-start/marker-end; confirm they are still needed. -->
  <marker id="arrowblue" orient="auto" refX="5" refY="5" markerWidth="10" markerHeight="10">
    <path fill="#3B82F6" d="M0 0 V10 L10 5 Z"/>
  </marker>
  <marker id="arrowgreen" orient="auto" refX="5" refY="5" markerWidth="10" markerHeight="10">
    <path fill="#10B981" d="M0 0 V10 L10 5 Z"/>
  </marker>
</defs>
</svg>