Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| <html lang=en dir=ltr class="docs-wrapper plugin-docs plugin-id-default docs-version-current docs-doc-page docs-doc-id-deployment/storage" data-has-hydrated=false><head><meta charset=UTF-8><meta name=generator content="Docusaurus v3.10.0"><title data-rh=true>π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget) | Open Navigator</title><meta data-rh=true name=viewport content="width=device-width, initial-scale=1.0"/><meta data-rh=true property=og:image content=https://www.communityone.com/img/docusaurus-social-card.jpg /><meta data-rh=true name=twitter:image content=https://www.communityone.com/img/docusaurus-social-card.jpg /><meta data-rh=true property=og:url content=https://www.communityone.com/docs/deployment/storage /><meta data-rh=true property=og:locale content=en /><meta data-rh=true name=docusaurus_locale content=en /><meta data-rh=true name=docsearch:language content=en /><meta data-rh=true name=keywords content="civic engagement, policy tracking, meeting minutes, nonprofit tracking, municipal government, advocacy, open data, local government"/><meta data-rh=true property=og:type content=website /><meta data-rh=true property=og:site_name content="Open Navigator"/><meta data-rh=true name=twitter:card content=summary_large_image /><meta data-rh=true name=docusaurus_version content=current /><meta data-rh=true name=docusaurus_tag content=docs-default-current /><meta data-rh=true name=docsearch:version content=current /><meta data-rh=true name=docsearch:docusaurus_tag content=docs-default-current /><meta data-rh=true property=og:title content="π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget) | Open Navigator"/><meta data-rh=true name=description content="TL;DR: Use Hugging Face Datasets - it's FREE and unlimited for public data!"/><meta data-rh=true property=og:description content="TL;DR: Use Hugging Face Datasets - it's FREE and unlimited for public data!"/><link data-rh=true rel=icon href=/img/favicon.ico /><link data-rh=true rel=canonical 
href=https://www.communityone.com/docs/deployment/storage /><link data-rh=true rel=alternate href=https://www.communityone.com/docs/deployment/storage hreflang=en /><link data-rh=true rel=alternate href=https://www.communityone.com/docs/deployment/storage hreflang=x-default /><script data-rh=true type=application/ld+json>{"@context":"https://schema.org","@type":"BreadcrumbList","itemListElement":[{"@type":"ListItem","item":"https://www.communityone.com/docs/for-developers","name":"Developers & Technical Users","position":1},{"@type":"ListItem","item":"https://www.communityone.com/docs/deployment/storage","name":"π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget)","position":2}]}</script><link rel=alternate type=application/rss+xml href=/blog/rss.xml title="Open Navigator RSS Feed"><link rel=alternate type=application/atom+xml href=/blog/atom.xml title="Open Navigator Atom Feed"><link rel=preconnect href=https://www.google-analytics.com><link rel=preconnect href=https://www.googletagmanager.com><script async src="https://www.googletagmanager.com/gtag/js?id=G-5EQV815915"></script><script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","G-5EQV815915",{anonymize_ip:!0})</script><link rel=stylesheet href=/assets/css/styles.c89d6b2d.css /><script src=/assets/js/runtime~main.c8fa085e.js defer></script><script src=/assets/js/main.6e24e536.js defer></script></head><body><svg style="display: none;"><defs> | |
| <symbol id=theme-svg-external-link viewBox="0 0 24 24"><path fill=currentColor d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"/></symbol> | |
| </defs></svg> | |
| <script>!function(){var t=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return window.localStorage.getItem("theme-7e9")}catch(t){}}();document.documentElement.setAttribute("data-theme",t||(window.matchMedia("(prefers-color-scheme: dark)").matches?"dark":"light")),document.documentElement.setAttribute("data-theme-choice",t||"system")}(),function(){try{for(var[t,e]of new URLSearchParams(window.location.search).entries())if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id=__docusaurus><link rel=preload as=image href=/img/communityone_logo.svg /><script type=application/ld+json>{"@context":"https://schema.org","@type":"Organization","address":{"@type":"PostalAddress","addressCountry":"US","addressLocality":"Tuscaloosa","addressRegion":"AL","postalCode":"35406","streetAddress":"5617 Lakeridge Court"},"contactPoint":{"@type":"ContactPoint","availableLanguage":["English"],"contactType":"Customer Service","email":"johnbowyer@communityone.com"},"description":"Track 90,000+ jurisdictions, 1.8M nonprofits, and analyze meeting minutes with AI. 
The open path to everything local.","email":"johnbowyer@communityone.com","legalName":"CommunityOne","logo":"https://www.communityone.com/img/communityone_logo.svg","name":"CommunityOne","sameAs":["https://www.facebook.com/communityone","https://www.instagram.com/communityone","https://twitter.com/communityone","https://www.linkedin.com/company/communityone","https://www.youtube.com/@communityone","https://discord.gg/communityone","https://github.com/getcommunityone/open-navigator"],"url":"https://www.communityone.com"}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"WebSite","alternateName":"CommunityOne Open Navigator","description":"AI-powered civic engagement platform tracking jurisdictions, nonprofits, and government meetings","name":"Open Navigator","potentialAction":{"@type":"SearchAction","query-input":"required name=search_term_string","target":{"@type":"EntryPoint","urlTemplate":"https://www.communityone.com/search?q={search_term_string}"}},"url":"https://www.communityone.com"}</script><script type=application/ld+json>{"@context":"https://schema.org","@type":"SoftwareApplication","aggregateRating":{"@type":"AggregateRating","ratingCount":"1","ratingValue":"5"},"applicationCategory":"BusinessApplication","description":"Track 90,000+ jurisdictions, 1.8M nonprofits, and analyze meeting minutes with AI","featureList":["Track 90,000+ jurisdictions","Monitor 1.8M nonprofits","Analyze meeting minutes","Legislative bill tracking","Campaign finance data"],"name":"Open Navigator","offers":{"@type":"Offer","price":"0","priceCurrency":"USD"},"operatingSystem":"Web","screenshot":"https://www.communityone.com/img/docusaurus-social-card.jpg","softwareVersion":"1.0.0"}</script><div role=region aria-label="Skip to main content"><a class=skipToContent_fXgn href=#__docusaurus_skipToContent_fallback>Skip to main content</a></div><nav aria-label=Main class="theme-layout-navbar navbar navbar--fixed-top"><div class=navbar__inner><div 
class="theme-layout-navbar-left navbar__items"><button aria-label="Toggle navigation bar" aria-expanded=false class="navbar__toggle clean-btn" type=button><svg width=30 height=30 viewBox="0 0 30 30" aria-hidden=true><path stroke=currentColor stroke-linecap=round stroke-miterlimit=10 stroke-width=2 d="M4 7h22M4 15h22M4 23h22"/></svg></button><a href=https://www.communityone.com target=_self rel="noopener noreferrer" class=navbar__brand><div class=navbar__logo><img src=/img/communityone_logo.svg alt="CommunityOne Logo" class="themedComponent_mlkZ themedComponent--light_NVdE"/><img src=/img/communityone_logo.svg alt="CommunityOne Logo" class="themedComponent_mlkZ themedComponent--dark_xIcU"/></div><b class="navbar__title text--truncate">Open Navigator Home</b></a><a class="navbar__item navbar__link" href=/docs/intro>Getting Started</a><a class="navbar__item navbar__link" href=/docs/for-families>Families & Individuals</a><a class="navbar__item navbar__link" href=/docs/for-advocates>Policy Makers</a><a class="navbar__item navbar__link" href=/docs/for-developers>Developers</a><a class="navbar__item navbar__link" href=/docs/data-sources/citations>Data and Terms</a><a class="navbar__item navbar__link" href=/blog>Blog</a></div><div class="theme-layout-navbar-right navbar__items navbar__items--right"><a href=https://github.com/getcommunityone/open-navigator-for-engagement target=_blank rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type=button disabled title="system mode" aria-label="Switch between dark and light mode (currently system mode)"><svg viewBox="0 0 24 24" width=24 height=24 aria-hidden=true class="toggleIcon_g3eP lightToggleIcon_pyhR"><path fill=currentColor 
d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"/></svg><svg viewBox="0 0 24 24" width=24 height=24 aria-hidden=true class="toggleIcon_g3eP darkToggleIcon_wfgR"><path fill=currentColor d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"/></svg><svg viewBox="0 0 24 24" width=24 height=24 aria-hidden=true class="toggleIcon_g3eP systemToggleIcon_QzmC"><path fill=currentColor d="m12 21c4.971 0 9-4.029 9-9s-4.029-9-9-9-9 4.029-9 9 4.029 9 9 9zm4.95-13.95c1.313 1.313 2.05 3.093 2.05 4.95s-0.738 3.637-2.05 4.95c-1.313 1.313-3.093 2.05-4.95 2.05v-14c1.857 0 3.637 0.737 4.95 2.05z"/></svg></button></div><div class=navbarSearchContainer_Bca1></div></div></div><div role=presentation class=navbar-sidebar__backdrop></div></nav><div 
id=__docusaurus_skipToContent_fallback class="theme-layout-main main-wrapper mainWrapper_z2l0"><div class=docsWrapper_hBAB><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type=button></button><div class=docRoot_UBD9><aside class="theme-doc-sidebar-container docSidebarContainer_YfHR"><div class=sidebarViewport_aRkj><div class=sidebar_njMd><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=true href=/docs/intro><span title="Getting Started" class=categoryLinkLabel_W154>Getting Started</span></a></div><ul class=menu__list><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class=menu__link tabindex=0 href=/docs/intro><span title=Introduction class=linkLabel_WmDU>Introduction</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class=menu__link tabindex=0 href=/docs/open-navigator><span title="Open Navigator" class=linkLabel_WmDU>Open Navigator</span></a></ul><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist" href=/docs/for-families><span title="Families & Individuals" class=categoryLinkLabel_W154>Families & Individuals</span></a><button aria-label="Collapse sidebar category 'Families & Individuals'" aria-expanded=true type=button class="clean-btn menu__caret"></button></div><ul class=menu__list><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a 
class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/families/community-events><span title="Resources for Families" class=categoryLinkLabel_W154>Resources for Families</span></a></div><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class=menu__link tabindex=0 href=/docs/open-navigator><span title="Getting Started with Open Navigator" class=linkLabel_WmDU>Getting Started with Open Navigator</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class=menu__link tabindex=0 href=/docs/data-sources/citations><span title="Data and Citations" class=linkLabel_WmDU>Data and Citations</span></a></ul><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist" href=/docs/for-advocates><span title="Policy Makers & Advocates" class=categoryLinkLabel_W154>Policy Makers & Advocates</span></a><button aria-label="Collapse sidebar category 'Policy Makers & Advocates'" aria-expanded=true type=button class="clean-btn menu__caret"></button></div><ul class=menu__list><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/data-sources/overview><span title="Understanding the Data" class=categoryLinkLabel_W154>Understanding the Data</span></a></div><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false 
tabindex=0 href=/docs/guides/political-economy><span title="Analysis & Strategy" class=categoryLinkLabel_W154>Analysis & Strategy</span></a></div><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/case-studies/tuscaloosa-complete><span title="Real-World Examples" class=categoryLinkLabel_W154>Real-World Examples</span></a></div></ul><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--active" href=/docs/for-developers><span title="Developers & Technical Users" class=categoryLinkLabel_W154>Developers & Technical Users</span></a><button aria-label="Collapse sidebar category 'Developers & Technical Users'" aria-expanded=true type=button class="clean-btn menu__caret"></button></div><ul class=menu__list><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/quickstart><span title="Setup & Installation" class=categoryLinkLabel_W154>Setup & Installation</span></a></div><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/data-sources/citations><span title="Data Sources (Technical)" class=categoryLinkLabel_W154>Data Sources (Technical)</span></a></div><li 
class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/guides/jurisdiction-setup><span title="How-To Guides" class=categoryLinkLabel_W154>How-To Guides</span></a></div><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/integrations/mcp-server><span title=Integrations class=categoryLinkLabel_W154>Integrations</span></a></div><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" role=button aria-expanded=true tabindex=0 href=/docs/deployment/databricks-apps><span title=Deployment class=categoryLinkLabel_W154>Deployment</span></a></div><ul class=menu__list><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/databricks-apps><span title="Databricks Apps Deployment Guide" class=linkLabel_WmDU>Databricks Apps Deployment Guide</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/databricks-migration><span title="Databricks Agent Bricks Refactoring - Summary" class=linkLabel_WmDU>Databricks Agent Bricks Refactoring - Summary</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/quickstart-databricks><span 
title="Quick Start Guide - React + FastAPI Databricks App" class=linkLabel_WmDU>Quick Start Guide - React + FastAPI Databricks App</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/huggingface-spaces><span title="Hugging Face Spaces Deployment" class=linkLabel_WmDU>Hugging Face Spaces Deployment</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/oauth-providers-setup><span title="OAuth Providers Setup" class=linkLabel_WmDU>OAuth Providers Setup</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/authentication-setup><span title="Authentication Setup Guide" class=linkLabel_WmDU>Authentication Setup Guide</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/schema-migration><span title="Schema Migration Guide" class=linkLabel_WmDU>Schema Migration Guide</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/build-verification><span title="Build Verification & CI/CD" class=linkLabel_WmDU>Build Verification & CI/CD</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/variable-migration><span title="π Variable Name Migration Guide" class=linkLabel_WmDU>π Variable Name Migration Guide</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/d-drive-configuration><span title="D Drive Configuration for Large Datasets" class=linkLabel_WmDU>D Drive Configuration for Large 
Datasets</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/docker-troubleshooting><span title="π Docker Build Troubleshooting Guide" class=linkLabel_WmDU>π Docker Build Troubleshooting Guide</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/build-protection><span title="Build Protection & CI/CD" class=linkLabel_WmDU>Build Protection & CI/CD</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/rename-repository><span title="Rename Repository & Make Public" class=linkLabel_WmDU>Rename Repository & Make Public</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/costs><span title="π° Cost Breakdown: $0 for Data Access" class=linkLabel_WmDU>π° Cost Breakdown: $0 for Data Access</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/jurisdiction-discovery><span title="Jurisdiction Discovery - Deployment Options" class=linkLabel_WmDU>Jurisdiction Discovery - Deployment Options</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class=menu__link tabindex=0 href=/docs/deployment/scale><span title="π RUNNING DISCOVERY FOR ALL U.S. CITIES AND COUNTIES" class=linkLabel_WmDU>π RUNNING DISCOVERY FOR ALL U.S. 
CITIES AND COUNTIES</span></a><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-3 menu__list-item"><a class="menu__link menu__link--active" aria-current=page tabindex=0 href=/docs/deployment/storage><span title="π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget)" class=linkLabel_WmDU>π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget)</span></a></ul><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-2 menu__list-item menu__list-item--collapsed"><div class=menu__list-item-collapsible><a class="categoryLink_byQd menu__link menu__link--sublist menu__link--sublist-caret" role=button aria-expanded=false tabindex=0 href=/docs/development/database-setup><span title=Development class=categoryLinkLabel_W154>Development</span></a></div></ul></ul></nav></div></div></aside><main class=docMainContainer_TBSr><div class="container padding-top--md padding-bottom--lg"><div class=row><div class="col docItemCol_VOVn"><div class=docItemContainer_Djhp><article><nav class="theme-doc-breadcrumbs breadcrumbsContainer_Z_bl" aria-label=Breadcrumbs><ul class=breadcrumbs><li class=breadcrumbs__item><a aria-label="Home page" class=breadcrumbs__link href=/><svg viewBox="0 0 24 24" class=breadcrumbHomeIcon_YNFT><path d="M10 19v-5h4v5c0 .55.45 1 1 1h3c.55 0 1-.45 1-1v-7h1.7c.46 0 .68-.57.33-.87L12.67 3.6c-.38-.34-.96-.34-1.34 0l-8.36 7.53c-.34.3-.13.87.33.87H5v7c0 .55.45 1 1 1h3c.55 0 1-.45 1-1z" fill=currentColor /></svg></a><li class=breadcrumbs__item><a class=breadcrumbs__link href=/docs/for-developers><span>Developers & Technical Users</span></a><li class=breadcrumbs__item><span class=breadcrumbs__link>Deployment</span><li class="breadcrumbs__item breadcrumbs__item--active"><span class=breadcrumbs__link>π° COST-EFFECTIVE STORAGE STRATEGY (Personal Budget)</span></ul></nav><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type=button class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div 
class="theme-doc-markdown markdown"><header><h1>💰 COST-EFFECTIVE STORAGE STRATEGY (Personal Budget)</h1></header> | |
| <p><strong>TL;DR: Use Hugging Face Datasets - it's FREE and unlimited for public data!</strong></p> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-the-problem>🎯 THE PROBLEM<a href=#-the-problem class=hash-link aria-label="Direct link to 🎯 THE PROBLEM" title="Direct link to 🎯 THE PROBLEM" translate=no>&#8203;</a></h2> | |
| <p><strong>Challenge:</strong></p> | |
| <ul> | |
| <li class="">Need to process 22,000+ jurisdictions</li> | |
| <li class="">Each jurisdiction has: agendas, minutes, videos, social media</li> | |
| <li class="">Estimated total: <strong>10-50 TB</strong> of raw content</li> | |
| <li class="">Limited local storage + personal budget</li> | |
| </ul> | |
| <p><strong>Solution: Don't store everything locally!</strong></p> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-recommended-strategy-hugging-face-datasets>✅ RECOMMENDED STRATEGY: HUGGING FACE DATASETS<a href=#-recommended-strategy-hugging-face-datasets class=hash-link aria-label="Direct link to ✅ RECOMMENDED STRATEGY: HUGGING FACE DATASETS" title="Direct link to ✅ RECOMMENDED STRATEGY: HUGGING FACE DATASETS" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=why-hugging-face>Why Hugging Face?<a href=#why-hugging-face class=hash-link aria-label="Direct link to Why Hugging Face?" title="Direct link to Why Hugging Face?" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class=""><strong>🆓 FREE</strong> - Unlimited storage for public datasets</li> | |
| <li class=""><strong>🌐 Cloud-based</strong> - No local storage needed</li> | |
| <li class=""><strong>📚 Versioned</strong> - Git-based dataset management</li> | |
| <li class=""><strong>🔍 Searchable</strong> - Built-in search and filtering</li> | |
| <li class=""><strong>🤝 Shareable</strong> - Public datasets help the research community</li> | |
| <li class=""><strong>⚡ Fast</strong> - Optimized for large datasets</li> | |
| </ol> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=οΈ-critical-file-limits>⚠️ CRITICAL: File Limits<a href=#οΈ-critical-file-limits class=hash-link aria-label="Direct link to ⚠️ CRITICAL: File Limits" title="Direct link to ⚠️ CRITICAL: File Limits" translate=no>&#8203;</a></h3> | |
| <p><strong>Hugging Face has repository limits:</strong></p> | |
| <ul> | |
| <li class="">Files per folder: &lt;10,000</li> | |
| <li class="">Total files per repo: &lt;100,000</li> | |
| <li class="">Large datasets: Use Parquet or WebDataset format</li> | |
| </ul> | |
| <p><strong>Your scale (22M files) exceeds limits!</strong></p> | |
| <p><strong>Solution: Use Parquet format</strong></p> | |
| <ul> | |
| <li class="">22 million PDFs → 50 Parquet files ✅</li> | |
| <li class="">See detailed guide: <a class="" href=/docs/deployment/HUGGINGFACE_FILE_LIMITS.md>HUGGINGFACE_FILE_LIMITS.md</a></li> | |
| </ul> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=what-to-store>What to Store<a href=#what-to-store class=hash-link aria-label="Direct link to What to Store" title="Direct link to What to Store" translate=no>&#8203;</a></h3> | |
| <p><strong>Store ONLY processed/filtered data, not raw content:</strong></p> | |
| <p>✅ <strong>Store:</strong></p> | |
| <ul> | |
| <li class="">Extracted text from PDFs</li> | |
| <li class="">Meeting metadata (date, title, URL)</li> | |
| <li class="">Oral health-related snippets</li> | |
| <li class="">Social media links</li> | |
| <li class="">Discovery results (JSON)</li> | |
| </ul> | |
| <p>❌ <strong>Don't Store:</strong></p> | |
| <ul> | |
| <li class="">Full video files (link to YouTube instead)</li> | |
| <li class="">Full PDF files (store text + source URL)</li> | |
| <li class="">Website HTML dumps</li> | |
| <li class="">Duplicate content</li> | |
| </ul> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-storage-estimates>📊 STORAGE ESTIMATES<a href=#-storage-estimates class=hash-link aria-label="Direct link to 📊 STORAGE ESTIMATES" title="Direct link to 📊 STORAGE ESTIMATES" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=raw-content-dont-download-all>Raw Content (DON'T download all):<a href=#raw-content-dont-download-all class=hash-link aria-label="Direct link to Raw Content (DON'T download all):" title="Direct link to Raw Content (DON'T download all):" translate=no>&#8203;</a></h3> | |
| <div class="language-text codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain">Videos: 5,000 channels × 100 videos × 500 MB = 250 TB ❌</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">PDFs: 15,000 jurisdictions × 1,000 docs × 2 MB = 30 TB ❌</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">Social media: 18,000 accounts × archives = 5 TB ❌</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">TOTAL RAW: ~285 TB 🚫 TOO EXPENSIVE!</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=processed-content-hugging-face-approach>Processed Content (Hugging Face approach):<a href=#processed-content-hugging-face-approach class=hash-link aria-label="Direct link to Processed Content (Hugging Face approach):" title="Direct link to Processed Content (Hugging Face approach):" translate=no>&#8203;</a></h3> | |
| <div class="language-text codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-text codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain">Discovery data: 22,000 jurisdictions × 50 KB = 1.1 GB ✅</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">Meeting metadata: 500,000 meetings × 5 KB = 2.5 GB ✅</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">Extracted text: 500,000 docs × 50 KB = 25 GB ✅</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">Oral health subset: 50,000 relevant docs × 100 KB = 5 GB ✅</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">TOTAL PROCESSED: ~34 GB ✅ TOTALLY FREE on Hugging Face!</span><br/></div></code></pre></div></div> | |
| <p><strong>Savings: 285 TB → 34 GB = 99.99% reduction!</strong></p> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-step-by-step-hugging-face-workflow>🚀 STEP-BY-STEP: HUGGING FACE WORKFLOW<a href=#-step-by-step-hugging-face-workflow class=hash-link aria-label="Direct link to 🚀 STEP-BY-STEP: HUGGING FACE WORKFLOW" title="Direct link to 🚀 STEP-BY-STEP: HUGGING FACE WORKFLOW" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=step-1-create-free-hugging-face-account>Step 1: Create Free Hugging Face Account<a href=#step-1-create-free-hugging-face-account class=hash-link aria-label="Direct link to Step 1: Create Free Hugging Face Account" title="Direct link to Step 1: Create Free Hugging Face Account" translate=no>&#8203;</a></h3> | |
| <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain"># Sign up at https://huggingface.co/join</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Create account (FREE)</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Get your access token from https://huggingface.co/settings/tokens</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=step-2-install-hugging-face-libraries>Step 2: Install Hugging Face Libraries<a href=#step-2-install-hugging-face-libraries class=hash-link aria-label="Direct link to Step 2: Install Hugging Face Libraries" title="Direct link to Step 2: Install Hugging Face Libraries" translate=no>&#8203;</a></h3> | |
| <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain">pip install huggingface_hub datasets</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=step-3-create-your-dataset>Step 3: Create Your Dataset<a href=#step-3-create-your-dataset class=hash-link aria-label="Direct link to Step 3: Create Your Dataset" title="Direct link to Step 3: Create Your Dataset" translate=no>&#8203;</a></h3> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> huggingface_hub </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> HfApi</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> create_repo</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> datasets </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> Dataset</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> pandas </span><span class="token keyword" style=color:#00009f>as</span><span class="token plain"> pd</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Login</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> huggingface_hub </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> login</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">login</span><span class="token punctuation" 
style=color:#393A34>(</span><span class="token plain">token</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"hf_YOUR_TOKEN"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Get from https://huggingface.co/settings/tokens</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Create dataset repository</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">repo_name </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>"oral-health-policy-data"</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">create_repo</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> repo_id</span><span class="token operator" style=color:#393A34>=</span><span class="token string-interpolation string" style=color:#e3116c>f"your-username/</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation">repo_name</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span 
class="token plain"> repo_type</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"dataset"</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> private</span><span class="token operator" style=color:#393A34>=</span><span class="token boolean" style=color:#36acaa>False</span><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Public = FREE unlimited storage!</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Upload discovery results</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">df </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> pd</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">read_csv</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>'data/bronze/discovered_sources/discovery_summary_final.csv'</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> Dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">from_pandas</span><span 
class="token punctuation" style=color:#393A34>(</span><span class="token plain">df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">push_to_hub</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"your-username/</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation">repo_name</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> split</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"discovery"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"β Dataset uploaded to Hugging Face!"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"View at: 
https://huggingface.co/datasets/your-username/</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation">repo_name</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>)</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=step-4-process-and-upload-pipeline>Step 4: Process-and-Upload Pipeline<a href=#step-4-process-and-upload-pipeline class=hash-link aria-label="Direct link to Step 4: Process-and-Upload Pipeline" title="Direct link to Step 4: Process-and-Upload Pipeline" translate=no>&#8203;</a></h3> | |
| <p><strong>DON'T download everything locally first!</strong></p> | |
| <p>Instead, use this streaming approach:</p> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> httpx</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> tempfile</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> pathlib </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> Path</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>async</span><span class="token plain"> </span><span class="token keyword" style=color:#00009f>def</span><span class="token plain"> </span><span class="token function" style=color:#d73a49>process_jurisdiction_streaming</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">jurisdiction</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token triple-quoted-string string" style=color:#e3116c>"""</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> Process jurisdiction WITHOUT storing 
locally:</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> 1. Download agenda PDF</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> 2. Extract text</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> 3. Filter for oral health keywords</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> 4. Upload to Hugging Face</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> 5. Delete local file</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c> """</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> results </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> </span><span class="token punctuation" style=color:#393A34>[</span><span class="token punctuation" style=color:#393A34>]</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Get agenda portal URLs</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> agendas </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> jurisdiction</span><span class="token punctuation" style=color:#393A34>[</span><span class="token string" 
style=color:#e3116c>'agenda_portals'</span><span class="token punctuation" style=color:#393A34>]</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>for</span><span class="token plain"> agenda_url </span><span class="token keyword" style=color:#00009f>in</span><span class="token plain"> agendas</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Download to temporary file</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>with</span><span class="token plain"> tempfile</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">NamedTemporaryFile</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">delete</span><span class="token operator" style=color:#393A34>=</span><span class="token boolean" style=color:#36acaa>False</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> suffix</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>'.pdf'</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"> </span><span class="token keyword" style=color:#00009f>as</span><span class="token plain"> tmp</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>async</span><span 
class="token plain"> </span><span class="token keyword" style=color:#00009f>with</span><span class="token plain"> httpx</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">AsyncClient</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"> </span><span class="token keyword" style=color:#00009f>as</span><span class="token plain"> client</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> response </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> </span><span class="token keyword" style=color:#00009f>await</span><span class="token plain"> client</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">get</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">agenda_url</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> tmp</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">write</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">response</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">content</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> tmp_path </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> tmp</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">name</span><br/></div><div class=token-line style=color:#393A34><span class="token 
plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Extract text (using PyPDF2 or similar)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> text </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> extract_text_from_pdf</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">tmp_path</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Filter for oral health content</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> keywords </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> </span><span class="token punctuation" style=color:#393A34>[</span><span class="token string" style=color:#e3116c>'fluoride'</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>'dental'</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>'oral health'</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>'water treatment'</span><span class="token punctuation" style=color:#393A34>]</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>if</span><span 
class="token plain"> </span><span class="token builtin">any</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">kw </span><span class="token keyword" style=color:#00009f>in</span><span class="token plain"> text</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">lower</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"> </span><span class="token keyword" style=color:#00009f>for</span><span class="token plain"> kw </span><span class="token keyword" style=color:#00009f>in</span><span class="token plain"> keywords</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> results</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">append</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>{</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'jurisdiction'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> jurisdiction</span><span class="token punctuation" style=color:#393A34>[</span><span class="token string" style=color:#e3116c>'name'</span><span class="token punctuation" style=color:#393A34>]</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'state'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> jurisdiction</span><span 
class="token punctuation" style=color:#393A34>[</span><span class="token string" style=color:#e3116c>'state'</span><span class="token punctuation" style=color:#393A34>]</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'url'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> agenda_url</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'text'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> text</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'date'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> extract_date</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">text</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token string" style=color:#e3116c>'relevant'</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"> </span><span class="token boolean" style=color:#36acaa>True</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token punctuation" style=color:#393A34>}</span><span class="token punctuation" 
style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Delete local file immediately</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> Path</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">tmp_path</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">unlink</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Upload batch to Hugging Face</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>if</span><span class="token plain"> results</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> upload_to_huggingface</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">results</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" 
style=color:#00009f>return</span><span class="token plain"> </span><span class="token builtin">len</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">results</span><span class="token punctuation" style=color:#393A34>)</span><br/></div></code></pre></div></div> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-cost-breakdown-free-options>💡 COST BREAKDOWN: FREE OPTIONS<a href=#-cost-breakdown-free-options class=hash-link aria-label="Direct link to 💡 COST BREAKDOWN: FREE OPTIONS" title="Direct link to 💡 COST BREAKDOWN: FREE OPTIONS" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=option-1-hugging-face-recommended>Option 1: Hugging Face (RECOMMENDED)<a href=#option-1-hugging-face-recommended class=hash-link aria-label="Direct link to Option 1: Hugging Face (RECOMMENDED)" title="Direct link to Option 1: Hugging Face (RECOMMENDED)" translate=no>&#8203;</a></h3> | |
| <table><thead><tr><th>Item<th>Cost<th>Storage<tbody><tr><td><strong>Public datasets</strong><td><strong>FREE</strong><td><strong>UNLIMITED</strong><tr><td>Private datasets<td>FREE<td>100 GB<tr><td>Bandwidth<td>FREE<td>Unlimited downloads<tr><td>Processing<td>FREE<td>Use local computer</table> | |
| <p><strong>Total: $0/month</strong> ✅</p> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=option-2-github--hugging-face>Option 2: GitHub + Hugging Face<a href=#option-2-github--hugging-face class=hash-link aria-label="Direct link to Option 2: GitHub + Hugging Face" title="Direct link to Option 2: GitHub + Hugging Face" translate=no>&#8203;</a></h3> | |
| <table><thead><tr><th>Item<th>Cost<th>Storage<tbody><tr><td>GitHub (discovery data)<td>FREE<td>1 GB<tr><td>Hugging Face (processed text)<td>FREE<td>Unlimited<tr><td>GitHub LFS (large files)<td>$5/month<td>50 GB</table> | |
| <p><strong>Total: $0-5/month</strong> ✅</p> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=option-3-cloud-storage-if-needed>Option 3: Cloud Storage (if needed)<a href=#option-3-cloud-storage-if-needed class=hash-link aria-label="Direct link to Option 3: Cloud Storage (if needed)" title="Direct link to Option 3: Cloud Storage (if needed)" translate=no>&#8203;</a></h3> | |
| <p><strong>Only for temporary processing:</strong></p> | |
| <table><thead><tr><th>Provider<th>Free Tier<th>After Free Tier<tbody><tr><td><strong>AWS S3</strong><td>5 GB for 12 months<td>$0.023/GB/month<tr><td><strong>Google Cloud</strong><td>5 GB always free<td>$0.020/GB/month<tr><td><strong>Azure Blob</strong><td>5 GB for 12 months<td>$0.018/GB/month</table> | |
| <p><strong>Cost for 34 GB:</strong> ~$0.60–$0.80/month ✅</p> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-recommended-workflow>🎯 RECOMMENDED WORKFLOW<a href=#-recommended-workflow class=hash-link aria-label="Direct link to 🎯 RECOMMENDED WORKFLOW" title="Direct link to 🎯 RECOMMENDED WORKFLOW" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=phase-1-discovery-run-locally>Phase 1: Discovery (Run Locally)<a href=#phase-1-discovery-run-locally class=hash-link aria-label="Direct link to Phase 1: Discovery (Run Locally)" title="Direct link to Phase 1: Discovery (Run Locally)" translate=no>&#8203;</a></h3> | |
| <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain"># Run discovery for all jurisdictions</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">python discovery/comprehensive_discovery_pipeline.py --all</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Output: ~1 GB of JSON/CSV (fits on laptop!)</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Upload to Hugging Face immediately</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=phase-2-content-processing-stream--upload>Phase 2: Content Processing (Stream &amp; Upload)<a href=#phase-2-content-processing-stream--upload class=hash-link aria-label="Direct link to Phase 2: Content Processing (Stream &amp; Upload)" title="Direct link to Phase 2: Content Processing (Stream &amp; Upload)" translate=no>&#8203;</a></h3> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token comment" style=color:#999988;font-style:italic># For each jurisdiction:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>for</span><span class="token plain"> jurisdiction </span><span class="token keyword" style=color:#00009f>in</span><span class="token plain"> all_jurisdictions</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># 1. Download one PDF</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> pdf </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> download_pdf</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">jurisdiction</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">agenda_url</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># 2. 
Extract text</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> text </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> extract_text</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">pdf</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># 3. Check if oral health-related</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>if</span><span class="token plain"> is_relevant</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">text</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># 4. 
Upload to Hugging Face</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> upload_to_hf</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">text</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> metadata</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># 5. Delete local file</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> delete</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">pdf</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Local storage stays at ~100 MB (just temp files)!</span><br/></div></code></pre></div></div> | |
| <p><strong>Your laptop never stores more than a few hundred MB!</strong></p> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=phase-3-analysis-cloud-or-local>Phase 3: Analysis (Cloud or Local)<a href=#phase-3-analysis-cloud-or-local class=hash-link aria-label="Direct link to Phase 3: Analysis (Cloud or Local)" title="Direct link to Phase 3: Analysis (Cloud or Local)" translate=no>&#8203;</a></h3> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token comment" style=color:#999988;font-style:italic># Download ONLY relevant subset from Hugging Face</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> datasets </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> load_dataset</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Load just oral health documents</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> load_dataset</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"your-username/oral-health-policy-data"</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> split</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"oral_health"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span 
class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># This might be only 5 GB (totally manageable!)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"Total documents: </span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation builtin">len</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>(</span><span class="token string-interpolation interpolation">dataset</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>)</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Analyze locally or in Colab (FREE GPU!)</span><br/></div></code></pre></div></div> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-free-resources-you-can-use>🆓 FREE RESOURCES YOU CAN USE<a href=#-free-resources-you-can-use class=hash-link aria-label="Direct link to 🆓 FREE RESOURCES YOU CAN USE" title="Direct link to 🆓 FREE RESOURCES YOU CAN USE" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=1-hugging-face-datasets>1. Hugging Face Datasets<a href=#1-hugging-face-datasets class=hash-link aria-label="Direct link to 1. Hugging Face Datasets" title="Direct link to 1. Hugging Face Datasets" translate=no>&#8203;</a></h3> | |
| <ul> | |
| <li class=""><strong>Storage:</strong> Unlimited (public datasets)</li> | |
| <li class=""><strong>Cost:</strong> FREE</li> | |
| <li class=""><strong>Use:</strong> Primary storage for all processed data</li> | |
| </ul> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=2-google-colab>2. Google Colab<a href=#2-google-colab class=hash-link aria-label="Direct link to 2. Google Colab" title="Direct link to 2. Google Colab" translate=no>&#8203;</a></h3> | |
| <ul> | |
| <li class=""><strong>Compute:</strong> FREE GPU/TPU (15 GB RAM)</li> | |
| <li class=""><strong>Cost:</strong> FREE (or $10/month for Pro)</li> | |
| <li class=""><strong>Use:</strong> Process PDFs, run analysis</li> | |
| <li class=""><strong>Storage:</strong> 15 GB on Google Drive (FREE)</li> | |
| </ul> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=3-github>3. GitHub<a href=#3-github class=hash-link aria-label="Direct link to 3. GitHub" title="Direct link to 3. GitHub" translate=no>&#8203;</a></h3> | |
| <ul> | |
| <li class=""><strong>Storage:</strong> 1 GB (100 GB with LFS for $5/month)</li> | |
| <li class=""><strong>Cost:</strong> FREE for public repos</li> | |
| <li class=""><strong>Use:</strong> Code + discovery results</li> | |
| </ul> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=4-internet-archive-archiveorg>4. Internet Archive (archive.org)<a href=#4-internet-archive-archiveorg class=hash-link aria-label="Direct link to 4. Internet Archive (archive.org)" title="Direct link to 4. Internet Archive (archive.org)" translate=no>&#8203;</a></h3> | |
| <ul> | |
| <li class=""><strong>Storage:</strong> Unlimited (for public documents)</li> | |
| <li class=""><strong>Cost:</strong> FREE</li> | |
| <li class=""><strong>Use:</strong> Mirror government documents</li> | |
| </ul> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-sample-upload-to-hugging-face>📦 SAMPLE: UPLOAD TO HUGGING FACE<a href=#-sample-upload-to-hugging-face class=hash-link aria-label="Direct link to 📦 SAMPLE: UPLOAD TO HUGGING FACE" title="Direct link to 📦 SAMPLE: UPLOAD TO HUGGING FACE" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=create-upload-script>Create Upload Script<a href=#create-upload-script class=hash-link aria-label="Direct link to Create Upload Script" title="Direct link to Create Upload Script" translate=no>&#8203;</a></h3> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token comment" style=color:#999988;font-style:italic>#!/usr/bin/env python3</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token triple-quoted-string string" style=color:#e3116c>"""</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c>upload_to_huggingface.py - Stream processed data to Hugging Face</span><br/></div><div class=token-line style=color:#393A34><span class="token triple-quoted-string string" style=color:#e3116c>"""</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> datasets </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> Dataset</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> DatasetDict</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> huggingface_hub </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> login</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> pandas 
</span><span class="token keyword" style=color:#00009f>as</span><span class="token plain"> pd</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> pathlib </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> Path</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> os</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Configuration</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">HUGGINGFACE_TOKEN </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> os</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">environ</span><span class="token punctuation" style=color:#393A34>[</span><span class="token string" style=color:#e3116c>"HUGGINGFACE_TOKEN"</span><span class="token punctuation" style=color:#393A34>]</span><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Read from the environment; create one at https://huggingface.co/settings/tokens</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">HF_REPO </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>"your-username/oral-health-policy-data"</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>def</span><span class="token plain"> </span><span class="token function" style=color:#d73a49>upload_discovery_results</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" 
style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token triple-quoted-string string" style=color:#e3116c>"""Upload discovery results (JSON/CSV)"""</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> login</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">token</span><span class="token operator" style=color:#393A34>=</span><span class="token plain">HUGGINGFACE_TOKEN</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Load discovery data</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> discovery_dir </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> Path</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"data/bronze/discovered_sources"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Load all discovery CSVs</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> all_data </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> 
</span><span class="token punctuation" style=color:#393A34>[</span><span class="token punctuation" style=color:#393A34>]</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>for</span><span class="token plain"> csv_file </span><span class="token keyword" style=color:#00009f>in</span><span class="token plain"> discovery_dir</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">glob</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"*.csv"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> df </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> pd</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">read_csv</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">csv_file</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> all_data</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">append</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Combine and upload</span><span class="token plain"></span><br/></div><div 
class=token-line style=color:#393A34><span class="token plain"> combined </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> pd</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">concat</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">all_data</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> ignore_index</span><span class="token operator" style=color:#393A34>=</span><span class="token boolean" style=color:#36acaa>True</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> Dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">from_pandas</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">combined</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">push_to_hub</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">HF_REPO</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> split</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"discovery"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line 
style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"β Uploaded </span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation builtin">len</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>(</span><span class="token string-interpolation interpolation">combined</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>)</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c> jurisdictions to Hugging Face"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"View at: https://huggingface.co/datasets/</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation">HF_REPO</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>def</span><span 
class="token plain"> </span><span class="token function" style=color:#d73a49>upload_meeting_data</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">meetings_df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token triple-quoted-string string" style=color:#e3116c>"""Upload processed meeting data"""</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Convert to dataset</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> Dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">from_pandas</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">meetings_df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token comment" style=color:#999988;font-style:italic># Upload</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">push_to_hub</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">HF_REPO</span><span class="token 
punctuation" style=color:#393A34>,</span><span class="token plain"> split</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"meetings"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"β Uploaded </span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation builtin">len</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>(</span><span class="token string-interpolation interpolation">meetings_df</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>)</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c> meetings"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>def</span><span class="token plain"> </span><span class="token function" style=color:#d73a49>upload_oral_health_subset</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">filtered_df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token punctuation" style=color:#393A34>:</span><span class="token 
plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token triple-quoted-string string" style=color:#e3116c>"""Upload filtered oral health content"""</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> Dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">from_pandas</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">filtered_df</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">push_to_hub</span><span class="token punctuation" style=color:#393A34>(</span><span class="token plain">HF_REPO</span><span class="token punctuation" style=color:#393A34>,</span><span class="token plain"> split</span><span class="token operator" style=color:#393A34>=</span><span class="token string" style=color:#e3116c>"oral_health"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> </span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"β Uploaded </span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation 
builtin">len</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>(</span><span class="token string-interpolation interpolation">filtered_df</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>)</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c> oral health documents"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>if</span><span class="token plain"> __name__ </span><span class="token operator" style=color:#393A34>==</span><span class="token plain"> </span><span class="token string" style=color:#e3116c>"__main__"</span><span class="token punctuation" style=color:#393A34>:</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"> upload_discovery_results</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=run-upload>Run Upload<a href=#run-upload class=hash-link aria-label="Direct link to Run Upload" title="Direct link to Run Upload" translate=no>&#8203;</a></h3> | |
| <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain"># Set your token</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">export HUGGINGFACE_TOKEN="hf_YOUR_TOKEN"</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Upload discovery results</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">python scripts/upload_to_huggingface.py</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># View your dataset</span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># https://huggingface.co/datasets/your-username/oral-health-policy-data</span><br/></div></code></pre></div></div> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-total-cost-estimate>💰 TOTAL COST ESTIMATE<a href=#-total-cost-estimate class=hash-link aria-label="Direct link to 💰 TOTAL COST ESTIMATE" title="Direct link to 💰 TOTAL COST ESTIMATE" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=personal-budget-approach-recommended>Personal Budget Approach (RECOMMENDED)<a href=#personal-budget-approach-recommended class=hash-link aria-label="Direct link to Personal Budget Approach (RECOMMENDED)" title="Direct link to Personal Budget Approach (RECOMMENDED)" translate=no>&#8203;</a></h3> | |
| <table><thead><tr><th>Component<th>Cost<th>Notes<tbody><tr><td><strong>Hugging Face</strong><td><strong>$0/month</strong><td>Public datasets = FREE<tr><td><strong>Local computer</strong><td>$0/month<td>Use your laptop<tr><td><strong>Internet</strong><td>$0/month<td>Use existing connection<tr><td><strong>Google Colab</strong><td>$0/month<td>FREE tier (or $10/month Pro)<tr><td><strong>GitHub</strong><td>$0/month<td>Public repos FREE<tr><td><strong>TOTAL</strong><td><strong>$0/month</strong><td>✅ <strong>100% FREE!</strong></table> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=professional-approach-if-scaling-up>Professional Approach (if scaling up)<a href=#professional-approach-if-scaling-up class=hash-link aria-label="Direct link to Professional Approach (if scaling up)" title="Direct link to Professional Approach (if scaling up)" translate=no>&#8203;</a></h3> | |
| <table><thead><tr><th>Component<th>Cost<th>Notes<tbody><tr><td>Hugging Face Pro<td>$9/month<td>Faster processing<tr><td>Google Colab Pro<td>$10/month<td>More GPU time<tr><td>AWS S3 (50 GB)<td>$1/month<td>Temporary storage<tr><td><strong>TOTAL</strong><td><strong>$20/month</strong><td>Still very affordable</table> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-real-example-meetingbank-dataset>🌟 REAL EXAMPLE: MeetingBank Dataset<a href=#-real-example-meetingbank-dataset class=hash-link aria-label="Direct link to 🌟 REAL EXAMPLE: MeetingBank Dataset" title="Direct link to 🌟 REAL EXAMPLE: MeetingBank Dataset" translate=no>&#8203;</a></h2> | |
| <p><strong>Existing dataset on Hugging Face:</strong></p> | |
| <ul> | |
| <li class="">Name: <code>huuuyeah/meetingbank</code></li> | |
| <li class="">Size: 1,366 meetings, 121 MB</li> | |
| <li class="">Cost: FREE</li> | |
| <li class="">Link: <a href=https://huggingface.co/datasets/huuuyeah/meetingbank target=_blank rel="noopener noreferrer" class="">https://huggingface.co/datasets/huuuyeah/meetingbank</a></li> | |
| </ul> | |
| <p><strong>You can do the same for oral health policy!</strong></p> | |
| <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-python codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token comment" style=color:#999988;font-style:italic># Load existing MeetingBank data (FREE)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>from</span><span class="token plain"> datasets </span><span class="token keyword" style=color:#00009f>import</span><span class="token plain"> load_dataset</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">meetingbank </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> load_dataset</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"huuuyeah/meetingbank"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token keyword" style=color:#00009f>print</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string-interpolation string" style=color:#e3116c>f"Meetings: </span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>{</span><span class="token string-interpolation interpolation builtin">len</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>(</span><span class="token string-interpolation interpolation">meetingbank</span><span class="token 
string-interpolation interpolation punctuation" style=color:#393A34>[</span><span class="token string-interpolation interpolation string" style=color:#e3116c>'train'</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>]</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>)</span><span class="token string-interpolation interpolation punctuation" style=color:#393A34>}</span><span class="token string-interpolation string" style=color:#e3116c>"</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"></span><span class="token comment" style=color:#999988;font-style:italic># Create YOUR oral health dataset (also FREE!)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">your_dataset </span><span class="token operator" style=color:#393A34>=</span><span class="token plain"> create_oral_health_dataset</span><span class="token punctuation" style=color:#393A34>(</span><span class="token punctuation" style=color:#393A34>)</span><span class="token plain"></span><br/></div><div class=token-line style=color:#393A34><span class="token plain">your_dataset</span><span class="token punctuation" style=color:#393A34>.</span><span class="token plain">push_to_hub</span><span class="token punctuation" style=color:#393A34>(</span><span class="token string" style=color:#e3116c>"your-username/oral-health-meetings"</span><span class="token punctuation" style=color:#393A34>)</span><br/></div></code></pre></div></div> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-action-plan-for-you>✅ ACTION PLAN FOR YOU<a href=#-action-plan-for-you class=hash-link aria-label="Direct link to ✅ ACTION PLAN FOR YOU" title="Direct link to ✅ ACTION PLAN FOR YOU" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=week-1-setup-cost-0>Week 1: Setup (Cost: $0)<a href=#week-1-setup-cost-0 class=hash-link aria-label="Direct link to Week 1: Setup (Cost: $0)" title="Direct link to Week 1: Setup (Cost: $0)" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class="">✅ Create Hugging Face account (FREE)</li> | |
| <li class="">✅ Get API token</li> | |
| <li class="">✅ Install libraries: <code>pip install huggingface_hub datasets</code></li> | |
| <li class="">✅ Create dataset repo: <code>oral-health-policy-data</code></li> | |
| </ol> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=week-2-discovery-cost-0>Week 2: Discovery (Cost: $0)<a href=#week-2-discovery-cost-0 class=hash-link aria-label="Direct link to Week 2: Discovery (Cost: $0)" title="Direct link to Week 2: Discovery (Cost: $0)" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class="">Run discovery pipeline for all 22,000 jurisdictions</li> | |
| <li class="">Upload discovery results to Hugging Face (~1 GB)</li> | |
| <li class="">Free up local storage</li> | |
| </ol> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=week-3-4-content-processing-cost-0>Week 3-4: Content Processing (Cost: $0)<a href=#week-3-4-content-processing-cost-0 class=hash-link aria-label="Direct link to Week 3-4: Content Processing (Cost: $0)" title="Direct link to Week 3-4: Content Processing (Cost: $0)" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class="">Process jurisdictions one at a time (streaming)</li> | |
| <li class="">Extract text from PDFs</li> | |
| <li class="">Filter for oral health keywords</li> | |
| <li class="">Upload to Hugging Face</li> | |
| <li class="">Delete local files immediately</li> | |
| </ol> | |
| <p><strong>Local storage never exceeds 1 GB!</strong></p> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=ongoing-analysis-cost-0>Ongoing: Analysis (Cost: $0)<a href=#ongoing-analysis-cost-0 class=hash-link aria-label="Direct link to Ongoing: Analysis (Cost: $0)" title="Direct link to Ongoing: Analysis (Cost: $0)" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class="">Download relevant subset from Hugging Face</li> | |
| <li class="">Analyze using Google Colab (FREE GPU)</li> | |
| <li class="">Publish findings back to Hugging Face</li> | |
| </ol> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-key-principles>🔑 KEY PRINCIPLES<a href=#-key-principles class=hash-link aria-label="Direct link to 🔑 KEY PRINCIPLES" title="Direct link to 🔑 KEY PRINCIPLES" translate=no>&#8203;</a></h2> | |
| <p><strong>1. Process, Don't Store</strong></p> | |
| <ul> | |
| <li class="">Download → Process → Upload → Delete</li> | |
| <li class="">Never keep raw files locally</li> | |
| </ul> | |
| <p><strong>2. Filter Early</strong></p> | |
| <ul> | |
| <li class="">Only save oral health-related content</li> | |
| <li class="">Discard irrelevant documents immediately</li> | |
| </ul> | |
| <p><strong>3. Use Text, Not Files</strong></p> | |
| <ul> | |
| <li class="">Store extracted text (KB), not PDFs (MB)</li> | |
| <li class="">Link to original sources instead of duplicating</li> | |
| </ul> | |
| <p><strong>4. Leverage Free Platforms</strong></p> | |
| <ul> | |
| <li class="">Hugging Face for datasets (FREE)</li> | |
| <li class="">Google Colab for processing (FREE)</li> | |
| <li class="">GitHub for code (FREE)</li> | |
| </ul> | |
| <p><strong>5. Make It Public</strong></p> | |
| <ul> | |
| <li class="">Public datasets = unlimited FREE storage</li> | |
| <li class="">Helps other researchers</li> | |
| <li class="">Builds your portfolio</li> | |
| </ul> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-additional-free-resources>📚 ADDITIONAL FREE RESOURCES<a href=#-additional-free-resources class=hash-link aria-label="Direct link to 📚 ADDITIONAL FREE RESOURCES" title="Direct link to 📚 ADDITIONAL FREE RESOURCES" translate=no>&#8203;</a></h2> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=processing-tools-free>Processing Tools (FREE)<a href=#processing-tools-free class=hash-link aria-label="Direct link to Processing Tools (FREE)" title="Direct link to Processing Tools (FREE)" translate=no>&#8203;</a></h3> | |
| <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style=--prism-color:#393A34;--prism-background-color:#f6f8fa><div class=codeBlockContent_QJqH><pre tabindex=0 class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style=color:#393A34;background-color:#f6f8fa><code class=codeBlockLines_e6Vv><div class=token-line style=color:#393A34><span class="token plain"># PDF text extraction</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">pip install pypdf2 pdfplumber</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Document processing</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">pip install beautifulsoup4 lxml</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Data handling</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">pip install pandas pyarrow</span><br/></div><div class=token-line style=color:#393A34><span class="token plain" style=display:inline-block></span><br/></div><div class=token-line style=color:#393A34><span class="token plain"># Upload to Hugging Face</span><br/></div><div class=token-line style=color:#393A34><span class="token plain">pip install huggingface_hub datasets</span><br/></div></code></pre></div></div> | |
| <h3 class="anchor anchorTargetStickyNavbar_Vzrq" id=computing-free>Computing (FREE)<a href=#computing-free class=hash-link aria-label="Direct link to Computing (FREE)" title="Direct link to Computing (FREE)" translate=no>&#8203;</a></h3> | |
| <ol> | |
| <li class=""> | |
| <p><strong>Google Colab</strong> - FREE GPU/TPU</p> | |
| <ul> | |
| <li class=""><a href=https://colab.research.google.com/ target=_blank rel="noopener noreferrer" class="">https://colab.research.google.com/</a></li> | |
| <li class="">15 GB RAM, 100 GB disk (temporary)</li> | |
| </ul> | |
| </li> | |
| <li class=""> | |
| <p><strong>Kaggle Notebooks</strong> - FREE GPU</p> | |
| <ul> | |
| <li class=""><a href=https://www.kaggle.com/code target=_blank rel="noopener noreferrer" class="">https://www.kaggle.com/code</a></li> | |
| <li class="">20 GB RAM, 73 GB disk (temporary)</li> | |
| </ul> | |
| </li> | |
| <li class=""> | |
| <p><strong>Hugging Face Spaces</strong> - FREE hosting</p> | |
| <ul> | |
| <li class=""><a href=https://huggingface.co/spaces target=_blank rel="noopener noreferrer" class="">https://huggingface.co/spaces</a></li> | |
| <li class="">Run demos and apps</li> | |
| </ul> | |
| </li> | |
| </ol> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-bottom-line>🎯 BOTTOM LINE<a href=#-bottom-line class=hash-link aria-label="Direct link to 🎯 BOTTOM LINE" title="Direct link to 🎯 BOTTOM LINE" translate=no>&#8203;</a></h2> | |
| <p><strong>YOU CAN DO THIS FOR $0/MONTH!</strong></p> | |
| <p>✅ <strong>Storage:</strong> Hugging Face (FREE, unlimited)<br/> | |
| <!-- -->✅ <strong>Processing:</strong> Local computer or Google Colab (FREE)<br/> | |
| <!-- -->✅ <strong>Code:</strong> GitHub (FREE)<br/> | |
| <!-- -->✅ <strong>Analysis:</strong> Google Colab (FREE GPU)</p> | |
| <p><strong>The entire 22,000-jurisdiction discovery and analysis can be done on a personal budget with ZERO cloud storage costs!</strong></p> | |
| <hr/> | |
| <h2 class="anchor anchorTargetStickyNavbar_Vzrq" id=-next-steps>🚀 NEXT STEPS<a href=#-next-steps class=hash-link aria-label="Direct link to 🚀 NEXT STEPS" title="Direct link to 🚀 NEXT STEPS" translate=no>&#8203;</a></h2> | |
| <ol> | |
| <li class=""><strong>Create Hugging Face account:</strong> <a href=https://huggingface.co/join target=_blank rel="noopener noreferrer" class="">https://huggingface.co/join</a></li> | |
| <li class=""><strong>Create your dataset repo:</strong> <code>oral-health-policy-data</code></li> | |
| <li class=""><strong>Run discovery pipeline</strong> (outputs ~1 GB locally)</li> | |
| <li class=""><strong>Upload to Hugging Face</strong> (FREE unlimited storage)</li> | |
| <li class=""><strong>Process content streaming</strong> (never store >100 MB locally)</li> | |
| </ol> | |
| <p><strong>Questions?</strong> Check Hugging Face docs: <a href=https://huggingface.co/docs/datasets/ target=_blank rel="noopener noreferrer" class="">https://huggingface.co/docs/datasets/</a></div><footer class="theme-doc-footer docusaurus-mt-lg"><div class="row margin-top--sm theme-doc-footer-edit-meta-row"><div class="col noPrint_WFHX"><a href=https://github.com/getcommunityone/open-navigator-for-engagement/tree/main/website/docs/deployment/storage.md target=_blank rel="noopener noreferrer" class=theme-edit-this-page><svg fill=currentColor height=20 width=20 viewBox="0 0 40 40" class=iconEdit_Z9Sw aria-hidden=true><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"/></g></svg>Edit this page</a></div><div class="col lastUpdated_JAkA"></div></div></footer></article><nav class="docusaurus-mt-lg pagination-nav" aria-label="Docs pages"><a class="pagination-nav__link pagination-nav__link--prev" href=/docs/deployment/scale><div class=pagination-nav__sublabel>Previous</div><div class=pagination-nav__label>π RUNNING DISCOVERY FOR ALL U.S. 
CITIES AND COUNTIES</div></a><a class="pagination-nav__link pagination-nav__link--next" href=/docs/development/database-setup><div class=pagination-nav__sublabel>Next</div><div class=pagination-nav__label>Database Setup & Stats Verification</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href=#-the-problem class="table-of-contents__link toc-highlight">π― THE PROBLEM</a><li><a href=#-recommended-strategy-hugging-face-datasets class="table-of-contents__link toc-highlight">β RECOMMENDED STRATEGY: HUGGING FACE DATASETS</a><ul><li><a href=#why-hugging-face class="table-of-contents__link toc-highlight">Why Hugging Face?</a><li><a href=#οΈ-critical-file-limits class="table-of-contents__link toc-highlight">β οΈ CRITICAL: File Limits</a><li><a href=#what-to-store class="table-of-contents__link toc-highlight">What to Store</a></ul><li><a href=#-storage-estimates class="table-of-contents__link toc-highlight">π STORAGE ESTIMATES</a><ul><li><a href=#raw-content-dont-download-all class="table-of-contents__link toc-highlight">Raw Content (DON'T download all):</a><li><a href=#processed-content-hugging-face-approach class="table-of-contents__link toc-highlight">Processed Content (Hugging Face approach):</a></ul><li><a href=#-step-by-step-hugging-face-workflow class="table-of-contents__link toc-highlight">π STEP-BY-STEP: HUGGING FACE WORKFLOW</a><ul><li><a href=#step-1-create-free-hugging-face-account class="table-of-contents__link toc-highlight">Step 1: Create Free Hugging Face Account</a><li><a href=#step-2-install-hugging-face-libraries class="table-of-contents__link toc-highlight">Step 2: Install Hugging Face Libraries</a><li><a href=#step-3-create-your-dataset class="table-of-contents__link toc-highlight">Step 3: Create Your Dataset</a><li><a href=#step-4-process-and-upload-pipeline class="table-of-contents__link toc-highlight">Step 4: 
Process-and-Upload Pipeline</a></ul><li><a href=#-cost-breakdown-free-options class="table-of-contents__link toc-highlight">π‘ COST BREAKDOWN: FREE OPTIONS</a><ul><li><a href=#option-1-hugging-face-recommended class="table-of-contents__link toc-highlight">Option 1: Hugging Face (RECOMMENDED)</a><li><a href=#option-2-github--hugging-face class="table-of-contents__link toc-highlight">Option 2: GitHub + Hugging Face</a><li><a href=#option-3-cloud-storage-if-needed class="table-of-contents__link toc-highlight">Option 3: Cloud Storage (if needed)</a></ul><li><a href=#-recommended-workflow class="table-of-contents__link toc-highlight">π― RECOMMENDED WORKFLOW</a><ul><li><a href=#phase-1-discovery-run-locally class="table-of-contents__link toc-highlight">Phase 1: Discovery (Run Locally)</a><li><a href=#phase-2-content-processing-stream--upload class="table-of-contents__link toc-highlight">Phase 2: Content Processing (Stream & Upload)</a><li><a href=#phase-3-analysis-cloud-or-local class="table-of-contents__link toc-highlight">Phase 3: Analysis (Cloud or Local)</a></ul><li><a href=#-free-resources-you-can-use class="table-of-contents__link toc-highlight">π FREE RESOURCES YOU CAN USE</a><ul><li><a href=#1-hugging-face-datasets class="table-of-contents__link toc-highlight">1. Hugging Face Datasets</a><li><a href=#2-google-colab class="table-of-contents__link toc-highlight">2. Google Colab</a><li><a href=#3-github class="table-of-contents__link toc-highlight">3. GitHub</a><li><a href=#4-internet-archive-archiveorg class="table-of-contents__link toc-highlight">4. 
Internet Archive (archive.org)</a></ul><li><a href=#-sample-upload-to-hugging-face class="table-of-contents__link toc-highlight">π¦ SAMPLE: UPLOAD TO HUGGING FACE</a><ul><li><a href=#create-upload-script class="table-of-contents__link toc-highlight">Create Upload Script</a><li><a href=#run-upload class="table-of-contents__link toc-highlight">Run Upload</a></ul><li><a href=#-total-cost-estimate class="table-of-contents__link toc-highlight">π° TOTAL COST ESTIMATE</a><ul><li><a href=#personal-budget-approach-recommended class="table-of-contents__link toc-highlight">Personal Budget Approach (RECOMMENDED)</a><li><a href=#professional-approach-if-scaling-up class="table-of-contents__link toc-highlight">Professional Approach (if scaling up)</a></ul><li><a href=#-real-example-meetingbank-dataset class="table-of-contents__link toc-highlight">π REAL EXAMPLE: MeetingBank Dataset</a><li><a href=#-action-plan-for-you class="table-of-contents__link toc-highlight">β ACTION PLAN FOR YOU</a><ul><li><a href=#week-1-setup-cost-0 class="table-of-contents__link toc-highlight">Week 1: Setup (Cost: $0)</a><li><a href=#week-2-discovery-cost-0 class="table-of-contents__link toc-highlight">Week 2: Discovery (Cost: $0)</a><li><a href=#week-3-4-content-processing-cost-0 class="table-of-contents__link toc-highlight">Week 3-4: Content Processing (Cost: $0)</a><li><a href=#ongoing-analysis-cost-0 class="table-of-contents__link toc-highlight">Ongoing: Analysis (Cost: $0)</a></ul><li><a href=#-key-principles class="table-of-contents__link toc-highlight">π KEY PRINCIPLES</a><li><a href=#-additional-free-resources class="table-of-contents__link toc-highlight">π ADDITIONAL FREE RESOURCES</a><ul><li><a href=#processing-tools-free class="table-of-contents__link toc-highlight">Processing Tools (FREE)</a><li><a href=#computing-free class="table-of-contents__link toc-highlight">Computing (FREE)</a></ul><li><a href=#-bottom-line class="table-of-contents__link toc-highlight">π― BOTTOM LINE</a><li><a 
href=#-next-steps class="table-of-contents__link toc-highlight">π NEXT STEPS</a></ul></div></div></div></div></main></div></div></div><footer class="theme-layout-footer footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="theme-layout-footer-column col footer__col"><div class=footer__title>Documentation</div><ul class="footer__items clean-list"><li class=footer__item><a class=footer__link-item href=/docs/intro>Getting Started</a><li class=footer__item><a class=footer__link-item href=/docs/data-sources/citations>Citations & Data Sources</a><li class=footer__item><a class=footer__link-item href=/docs/data-sources/overview>Data Sources</a><li class=footer__item><a class=footer__link-item href=/docs/for-developers>For Developers</a></ul></div><div class="theme-layout-footer-column col footer__col"><div class=footer__title>Resources</div><ul class="footer__items clean-list"><li class=footer__item><a href=https://www.communityone.com target=_blank rel="noopener noreferrer" class=footer__link-item>Launch Open Navigator<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://github.com/getcommunityone/open-navigator-for-engagement target=_blank rel="noopener noreferrer" class=footer__link-item>GitHub<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://www.groundvue.org/ target=_blank rel="noopener noreferrer" class=footer__link-item>GroundVue (Partner)<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a></ul></div><div class="theme-layout-footer-column col footer__col"><div class=footer__title>Community</div><ul class="footer__items clean-list"><li class=footer__item><a 
href=https://www.instagram.com/getcommunityone/ target=_blank rel="noopener noreferrer" class=footer__link-item>Instagram<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://www.facebook.com/getcommunityone target=_blank rel="noopener noreferrer" class=footer__link-item>Facebook<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://x.com/getcommunityone/ target=_blank rel="noopener noreferrer" class=footer__link-item>X (Twitter)<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://www.linkedin.com/company/getcommunityone target=_blank rel="noopener noreferrer" class=footer__link-item>LinkedIn<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://www.youtube.com/@getcommunityone target=_blank rel="noopener noreferrer" class=footer__link-item>YouTube<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a><li class=footer__item><a href=https://discord.gg/uH6Dytek target=_blank rel="noopener noreferrer" class=footer__link-item>Discord<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a></ul></div><div class="theme-layout-footer-column col footer__col"><div class=footer__title>Legal</div><ul class="footer__items clean-list"><li class=footer__item><a class=footer__link-item href=/docs/legal/privacy-policy>Privacy Policy</a><li class=footer__item><a class=footer__link-item href=/docs/legal/terms-of-service>Terms of Service</a><li 
class=footer__item><a class=footer__link-item href=/docs/legal/data-provider-terms>Data Provider Terms</a></ul></div><div class="theme-layout-footer-column col footer__col"><div class=footer__title>More</div><ul class="footer__items clean-list"><li class=footer__item><a class=footer__link-item href=/blog>Blog</a><li class=footer__item><a href=https://github.com/getcommunityone/open-navigator-for-engagement/blob/main/LICENSE target=_blank rel="noopener noreferrer" class=footer__link-item>License (MIT)<svg width=13.5 height=13.5 aria-label="(opens in new tab)" class=iconExternalLink_nPIU><use href=#theme-svg-external-link /></svg></a></ul></div></div><div class="footer__bottom text--center"><div class=footer__copyright>Copyright © 2026 Community One. Built with Docusaurus.</div></div></div></footer></div></body> |