package main

import (
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/parquet-go/parquet-go"
)

// RecipeRow holds all the scraped info for a single URL
type RecipeRow struct {
	URL               string `parquet:"url"`
	Title             string `parquet:"title"`
	JsonLd            string `parquet:"json_ld,zstd"`
	Html              string `parquet:"html,zstd"`
	Sitemap           string `parquet:"sitemap"`
	ScrapeTimestamp   int64  `parquet:"scrape_timestamp"`
	JsonLdPresent     bool   `parquet:"json_ld_present"`
	HtmlRecipePresent bool   `parquet:"html_recipe_present"`
	HttpStatusCode    int    `parquet:"http_status_code"`
}

// Recipe is the minimal extracted recipe served by the handlers below.
type Recipe struct {
	ID              int      `json:"id"`
	URL             string   `json:"url"`
	Title           string   `json:"title"`
	Text            string   `json:"text"` // combined description + instructions (searchable)
	Description     string   `json:"description"`
	Ingredients     []string `json:"ingredients"`
	Instructions    []string `json:"instructions"`
	RawJsonLd       string   `json:"raw_json_ld"`
	Author          string   `json:"author"`
	ImageURL        string   `json:"image_url"`
	Keywords        []string `json:"keywords"`
	AggregateRating float64  `json:"aggregate_rating"`
	RatingCount     int      `json:"rating_count"`
}

// store is the in-memory recipe index. The embedded RWMutex guards both
// fields; loadParquet swaps them wholesale under the write lock.
var store = struct {
	sync.RWMutex
	list []Recipe
	tags map[string]int
}{}

// datasetURL points to the public parquet file on Hugging Face. It can be
// overridden by setting the HF_DATASET_URL environment variable.
var datasetURL = "https://huggingface.co/datasets/Wissotsky/HebrewRecipes/resolve/main/recipes.parquet"

func main() {
	// ensure data dir exists
	if err := os.MkdirAll("data", 0755); err != nil {
		log.Printf("warning: could not create data dir: %v", err)
	}
	// Ensure parquet file is available; attempt download if missing.
	parquetPath := filepath.Join("data", "recipes.parquet")
	if err := fetchRecipesIfMissing(parquetPath); err != nil {
		log.Printf("warning: could not ensure parquet file: %v", err)
	}
	// Try loading regardless (may be missing if download failed)
	if err := loadParquet(parquetPath); err != nil {
		log.Printf("failed loading parquet: %v", err)
	}
	mux := http.NewServeMux()
	mux.HandleFunc("/", indexHandler)
	mux.HandleFunc("/tag", tagHandler)
	mux.HandleFunc("/search", searchHandler)
	mux.HandleFunc("/recipe/", recipeHandler)
	mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("static"))))
	addr := ":8080"
	log.Printf("Starting server on %s", addr)
	log.Fatal(http.ListenAndServe(addr, mux))
}

// indexHandler renders the home page with the full recipe list and tag counts.
func indexHandler(w http.ResponseWriter, r *http.Request) {
	store.RLock()
	list := store.list
	tags := store.tags
	store.RUnlock()
	// NOTE(review): list is intentionally not truncated; an earlier cap of 50
	// was commented out here.
	component := Index(list, tags)
	component.Render(r.Context(), w)
}

// searchHandler renders recipes whose title or text contains the
// case-insensitive query q. An empty query yields no results and no tags.
func searchHandler(w http.ResponseWriter, r *http.Request) {
	q := strings.TrimSpace(r.URL.Query().Get("q"))
	results := []Recipe{}
	tags := map[string]int{}
	if q != "" {
		ql := strings.ToLower(q)
		store.RLock()
		tags = store.tags
		for _, rec := range store.list {
			if strings.Contains(strings.ToLower(rec.Title), ql) ||
				strings.Contains(strings.ToLower(rec.Text), ql) {
				results = append(results, rec)
			}
			// NOTE(review): an earlier cap of 100 results was commented out here.
		}
		store.RUnlock()
	}
	Search(q, results, tags).Render(r.Context(), w)
}

// tagHandler renders recipes whose keyword list contains the exact tag q.
func tagHandler(w http.ResponseWriter, r *http.Request) {
	q := strings.TrimSpace(r.URL.Query().Get("q"))
	results := []Recipe{}
	tags := map[string]int{}
	if q != "" {
		store.RLock()
		tags = store.tags
		for _, rec := range store.list {
			if slices.Contains(rec.Keywords, q) {
				results = append(results, rec)
			}
		}
		store.RUnlock()
	}
	TagPage(q, results, tags).Render(r.Context(), w)
}

// recipeHandler renders a single recipe looked up by the numeric ID in the
// URL path (/recipe/{id}); 404 when the ID is absent or unknown.
func recipeHandler(w http.ResponseWriter, r *http.Request) {
	idStr := strings.TrimPrefix(r.URL.Path, "/recipe/")
	if idStr == "" {
		http.NotFound(w, r)
		return
	}
	var rec *Recipe
	store.RLock()
	for i := range store.list {
		if fmt.Sprint(store.list[i].ID) == idStr {
			rec = &store.list[i]
			break
		}
	}
	store.RUnlock()
	if rec == nil {
		http.NotFound(w, r)
		return
	}
	// Use the app logger rather than fmt.Println so all output goes to one sink.
	log.Printf("Showing recipe %d %s", rec.ID, rec.Title)
	RecipePage(*rec).Render(r.Context(), w)
}

// loadParquet reads the parquet file at path and rebuilds the in-memory
// recipe list and tag counts. Rows without JSON-LD are skipped; the list is
// sorted by rating count, most-rated first.
func loadParquet(path string) error {
	// Use parquet-go's generic ReadFile to load rows into Go structs.
	rows, err := parquet.ReadFile[RecipeRow](path)
	if err != nil {
		return err
	}
	if len(rows) == 0 {
		return nil
	}
	// Debug: print first 5 rows (URL, JsonLdPresent, JsonLd preview).
	// Named previewN (not "max") to avoid shadowing the Go 1.21 builtin.
	previewN := 5
	if len(rows) < previewN {
		previewN = len(rows)
	}
	log.Printf("Parquet rows: total=%d. Showing first %d rows:\n", len(rows), previewN)
	for i := 0; i < previewN; i++ {
		preview := rows[i].JsonLd
		if len(preview) > 200 {
			preview = preview[:200] + "..."
		}
		log.Printf("row %d: URL=%s JsonLdPresent=%v JsonLdPreview=%q\n", i+1, rows[i].URL, rows[i].JsonLdPresent, preview)
	}
	tmp := make([]Recipe, 0, len(rows))
	tags := make(map[string]int)
	id := 1
	for _, r := range rows {
		// only include rows where json_ld was present
		if !r.JsonLdPresent {
			continue
		}
		title := extractNameFromJsonLd(r.JsonLd)
		if title == "" {
			// fallback to URL when no name found
			title = r.URL
		}
		text := extractTextFromJsonLd(r.JsonLd)
		desc, ings, instr, author, image, keywords, aggregateRating, ratingCount := extractRecipeFields(r.JsonLd)
		for _, k := range keywords {
			tags[k]++
		}
		tmp = append(tmp, Recipe{
			ID: id, URL: r.URL, Title: title, Text: text,
			Description: desc, Ingredients: ings, Instructions: instr,
			RawJsonLd: r.JsonLd, Author: author, ImageURL: image,
			Keywords: keywords, AggregateRating: aggregateRating, RatingCount: ratingCount,
		})
		id++
	}
	// Sort by rating count, descending (the old comment said "by title",
	// which did not match the code).
	sort.Slice(tmp, func(i, j int) bool { return tmp[i].RatingCount > tmp[j].RatingCount })
	store.Lock()
	store.list = tmp
	store.tags = tags
	store.Unlock()
	log.Printf("Loaded %d recipes from parquet", len(tmp))
	return nil
}

// extractTextFromJsonLd parses a JSON-LD string and concatenates common
// recipe fields into a searchable text blob. Returns "" when the payload
// cannot be decoded (even after a gzip fallback).
func extractTextFromJsonLd(s string) string {
	if s == "" {
		return ""
	}
	// Some JsonLd may be an array or an object; decode generically.
	var v interface{}
	if err := json.NewDecoder(strings.NewReader(s)).Decode(&v); err != nil {
		// try gzip (sometimes compressed)
		gz, err2 := tryGunzip([]byte(s))
		if err2 != nil {
			return ""
		}
		if err3 := json.NewDecoder(bytes.NewReader(gz)).Decode(&v); err3 != nil {
			return ""
		}
	}
	// traverse to find Recipe objects
	var buf strings.Builder
	walkJSON(v, &buf)
	return buf.String()
}

// walkJSON walks decoded JSON-LD and, for every object whose @type string
// contains "recipe", appends its name/description/instructions/ingredients
// text to buf. All children are traversed regardless of type.
func walkJSON(v interface{}, buf *strings.Builder) {
	switch t := v.(type) {
	case map[string]interface{}:
		if tp, ok := t["@type"]; ok {
			if s, ok2 := tp.(string); ok2 && strings.Contains(strings.ToLower(s), "recipe") {
				// collect some fields
				for _, f := range []string{"name", "description", "recipeInstructions", "recipeIngredient"} {
					if val, ok := t[f]; ok {
						collectText(val, buf)
						buf.WriteString(" ")
					}
				}
			}
		}
		for _, v2 := range t {
			walkJSON(v2, buf)
		}
	case []interface{}:
		for _, e := range t {
			walkJSON(e, buf)
		}
	default:
		// scalars carry no recipe structure; ignore
	}
}

// collectText appends the textual content of a JSON-LD value to buf:
// strings directly, array elements ;-separated, and objects via their
// "text" member (schema.org HowToStep shape).
func collectText(v interface{}, buf *strings.Builder) {
	switch t := v.(type) {
	case string:
		buf.WriteString(t)
	case []interface{}:
		for _, e := range t {
			collectText(e, buf)
			buf.WriteString(";")
		}
	case map[string]interface{}:
		if name, ok := t["text"]; ok {
			if s, ok2 := name.(string); ok2 {
				buf.WriteString(s)
			}
		}
	}
}

// tryGunzip decompresses b as gzip, returning an error if b is not gzip data.
func tryGunzip(b []byte) ([]byte, error) {
	r, err := gzip.NewReader(bytes.NewReader(b))
	if err != nil {
		return nil, err
	}
	defer r.Close()
	return io.ReadAll(r)
}

// extractNameFromJsonLd decodes the JsonLd string and returns the first
// recipe "name" it finds, or empty string if none.
func extractNameFromJsonLd(s string) string { if s == "" { return "" } dec := json.NewDecoder(strings.NewReader(s)) var v interface{} if err := dec.Decode(&v); err != nil { if gz, err2 := tryGunzip([]byte(s)); err2 == nil { dec = json.NewDecoder(bytes.NewReader(gz)) if err3 := dec.Decode(&v); err3 != nil { return "" } } else { return "" } } // search for name field on recipe objects if name := findNameInJSON(v); name != "" { return name } return "" } // findNameInJSON traverses decoded JSON and returns the first "name" found // on an object whose @type contains "recipe". Returns empty string if none. func findNameInJSON(v interface{}) string { switch t := v.(type) { case map[string]interface{}: if tp, ok := t["@type"]; ok { if s, ok2 := tp.(string); ok2 && strings.Contains(strings.ToLower(s), "recipe") { if name, ok := t["name"]; ok { if ns, ok2 := name.(string); ok2 { return ns } } } } // also traverse children for _, v2 := range t { if found := findNameInJSON(v2); found != "" { return found } } case []interface{}: for _, e := range t { if found := findNameInJSON(e); found != "" { return found } } } return "" } // extractRecipeFields returns description, ingredients, instructions, author, // image URL, keywords, and aggregate rating parsed from JSON-LD. 
func extractRecipeFields(s string) (string, []string, []string, string, string, []string, float64, int) { if s == "" { return "", nil, nil, "", "", nil, 0, 0 } dec := json.NewDecoder(strings.NewReader(s)) var v interface{} if err := dec.Decode(&v); err != nil { if gz, err2 := tryGunzip([]byte(s)); err2 == nil { dec = json.NewDecoder(bytes.NewReader(gz)) if err3 := dec.Decode(&v); err3 != nil { return "", nil, nil, "", "", nil, 0, 0 } } else { return "", nil, nil, "", "", nil, 0, 0 } } // traverse to find recipe objects and collect fields var desc string var ings []string var instr []string var author string var image string var keywords []string var aggregateRating float64 var ratingCount int var walk func(interface{}) walk = func(node interface{}) { switch t := node.(type) { case map[string]interface{}: if tp, ok := t["@type"]; ok { if s, ok2 := tp.(string); ok2 && strings.Contains(strings.ToLower(s), "recipe") { // description if d, ok := t["description"]; ok { if ds, ok2 := d.(string); ok2 && desc == "" { desc = ds } } // ingredients if ing, ok := t["recipeIngredient"]; ok { switch it := ing.(type) { case string: ings = append(ings, it) case []interface{}: for _, e := range it { if s, ok := e.(string); ok { ings = append(ings, s) } } } } // instructions: may be text or array of objects with text if ins, ok := t["recipeInstructions"]; ok { switch it := ins.(type) { case string: instr = append(instr, it) case []interface{}: for _, step := range it { switch st := step.(type) { case string: instr = append(instr, st) case map[string]interface{}: if txt, ok := st["text"]; ok { if sText, ok2 := txt.(string); ok2 { instr = append(instr, sText) } } } } } } // author: can be string or object if a, ok := t["author"]; ok && author == "" { switch at := a.(type) { case string: author = at case map[string]interface{}: if n, ok := at["name"]; ok { if ns, ok2 := n.(string); ok2 { author = ns } } case []interface{}: // take first if len(at) > 0 { switch e := at[0].(type) { case 
string: author = e case map[string]interface{}: if n, ok := e["name"]; ok { if ns, ok2 := n.(string); ok2 { author = ns } } } } } } // image: can be string, object with url, or array if im, ok := t["image"]; ok && image == "" { switch it := im.(type) { case string: image = it case map[string]interface{}: if u, ok := it["url"]; ok { if us, ok2 := u.(string); ok2 { image = us } } else if id, ok := it["@id"]; ok { if ids, ok2 := id.(string); ok2 { image = ids } } case []interface{}: if len(it) > 0 { switch e := it[0].(type) { case string: image = e case map[string]interface{}: if u, ok := e["url"]; ok { if us, ok2 := u.(string); ok2 { image = us } } } } } } // keywords: can be string comma-separated or array if kw, ok := t["keywords"]; ok && len(keywords) == 0 { switch kt := kw.(type) { case string: for _, part := range strings.Split(kt, ",") { v := strings.TrimSpace(part) if v != "" { keywords = append(keywords, v) } } case []interface{}: for _, e := range kt { if s, ok := e.(string); ok { keywords = append(keywords, strings.TrimSpace(s)) } } } } // aggregateRating: can be object with ratingValue if ar, ok := t["aggregateRating"]; ok && aggregateRating == 0 { switch art := ar.(type) { case map[string]interface{}: if rv, ok := art["ratingValue"]; ok { switch rvt := rv.(type) { case float64: aggregateRating = rvt case string: if parsed, err := fmt.Sscanf(rvt, "%f", &aggregateRating); err == nil && parsed == 1 { // successfully parsed } } } if rc, ok := art["ratingCount"]; ok { switch rct := rc.(type) { case float64: ratingCount = int(rct) case int: ratingCount = rct case string: if parsed, err := fmt.Sscanf(rct, "%d", &ratingCount); err == nil && parsed == 1 { // successfully parsed } } } } } } } for _, c := range t { walk(c) } case []interface{}: for _, e := range t { walk(e) } } } walk(v) return desc, ings, instr, author, image, keywords, aggregateRating, ratingCount } // fetchRecipesIfMissing downloads the parquet file from Hugging Face if it's // not already 
present. It supports an optional HF_TOKEN environment variable // for private access. func fetchRecipesIfMissing(dest string) error { if _, err := os.Stat(dest); err == nil { return nil // already exists } url := os.Getenv("HF_DATASET_URL") if url == "" { url = datasetURL } // support HF token for private datasets token := os.Getenv("HF_TOKEN") // create directory if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { return err } client := &http.Client{Timeout: 60 * time.Second} var lastErr error for attempt := 1; attempt <= 3; attempt++ { req, _ := http.NewRequestWithContext(context.Background(), "GET", url, nil) if token != "" { req.Header.Set("Authorization", "Bearer "+token) } resp, err := client.Do(req) if err != nil { lastErr = err time.Sleep(time.Duration(attempt) * time.Second) continue } if resp.StatusCode != 200 { body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) resp.Body.Close() lastErr = fmt.Errorf("bad status %d: %s", resp.StatusCode, string(body)) time.Sleep(time.Duration(attempt) * time.Second) continue } // stream to file out, err := os.Create(dest) if err != nil { resp.Body.Close() return err } _, err = io.Copy(out, resp.Body) resp.Body.Close() out.Close() if err != nil { lastErr = err os.Remove(dest) time.Sleep(time.Duration(attempt) * time.Second) continue } return nil } return lastErr }