// Recipe Viewer — Wissotsky/HebrewRecipes browser (commit 94375a9).
// (Hosting-page residue above the package clause converted to a comment
// so the file compiles.)
package main
import (
	"bytes"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"path/filepath"
	"slices"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/parquet-go/parquet-go"
)
// RecipeRow holds all the scraped info for a single URL — it mirrors one
// record of the parquet dataset. The large text columns (JSON-LD and raw
// HTML) are zstd-compressed in the parquet encoding.
type RecipeRow struct {
	URL               string `parquet:"url"`
	Title             string `parquet:"title"`
	JsonLd            string `parquet:"json_ld,zstd"` // raw JSON-LD blob found on the page (may be empty)
	Html              string `parquet:"html,zstd"`    // raw page HTML (unused by the loader here)
	Sitemap           string `parquet:"sitemap"`
	ScrapeTimestamp   int64  `parquet:"scrape_timestamp"` // unix timestamp of the scrape — units not visible here; presumably seconds
	JsonLdPresent     bool   `parquet:"json_ld_present"`  // rows with this false are skipped by loadParquet
	HtmlRecipePresent bool   `parquet:"html_recipe_present"`
	HttpStatusCode    int    `parquet:"http_status_code"`
}
// Recipe is the minimal extracted recipe served to the UI. It is built from
// a RecipeRow's JSON-LD by loadParquet via the extract* helpers below.
type Recipe struct {
	ID              int      `json:"id"`  // 1-based, assigned in parquet row order by loadParquet
	URL             string   `json:"url"` // original page URL; also the Title fallback when JSON-LD has no name
	Title           string   `json:"title"`
	Text            string   `json:"text"` // combined description + instructions (searchable)
	Description     string   `json:"description"`
	Ingredients     []string `json:"ingredients"`
	Instructions    []string `json:"instructions"`
	RawJsonLd       string   `json:"raw_json_ld"` // untouched JSON-LD blob, kept for display/debugging
	Author          string   `json:"author"`
	ImageURL        string   `json:"image_url"`
	Keywords        []string `json:"keywords"` // used as tags; counted into store.tags
	AggregateRating float64  `json:"aggregate_rating"`
	RatingCount     int      `json:"rating_count"` // list sort key (descending) in loadParquet
}
var (
	// store is the in-memory recipe database shared by all HTTP handlers.
	// The embedded RWMutex guards list and tags; loadParquet replaces both
	// wholesale under the write lock (it never mutates them in place), so
	// snapshots taken under RLock remain safe to read after unlocking.
	store = struct {
		sync.RWMutex
		list []Recipe       // all loaded recipes, sorted by RatingCount descending
		tags map[string]int // keyword -> number of recipes carrying it
	}{}
)

// datasetURL points to the public parquet file on Hugging Face. It can be
// overridden by setting the HF_DATASET_URL environment variable.
var datasetURL = "https://huggingface.co/datasets/Wissotsky/HebrewRecipes/resolve/main/recipes.parquet"
// main ensures the parquet dataset is present (downloading it if missing),
// loads it into the in-memory store, and serves the UI on :8080.
func main() {
	// Fail fast when the data directory cannot be created — nothing below
	// can work without it. (The error was previously ignored.)
	if err := os.MkdirAll("data", 0755); err != nil {
		log.Fatalf("creating data dir: %v", err)
	}

	// Ensure parquet file is available; attempt download if missing.
	parquetPath := filepath.Join("data", "recipes.parquet")
	if err := fetchRecipesIfMissing(parquetPath); err != nil {
		log.Printf("warning: could not ensure parquet file: %v", err)
	}
	// Try loading regardless (may be missing if download failed); the
	// server starts either way and just serves an empty list.
	if err := loadParquet(parquetPath); err != nil {
		log.Printf("failed loading parquet: %v", err)
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/", indexHandler)
	mux.HandleFunc("/tag", tagHandler)
	mux.HandleFunc("/search", searchHandler)
	mux.HandleFunc("/recipe/", recipeHandler)
	mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("static"))))

	// Use an explicit http.Server so read/write timeouts are set;
	// the plain http.ListenAndServe default server has none.
	addr := ":8080"
	srv := &http.Server{
		Addr:         addr,
		Handler:      mux,
		ReadTimeout:  15 * time.Second,
		WriteTimeout: 60 * time.Second,
	}
	log.Printf("Starting server on %s", addr)
	log.Fatal(srv.ListenAndServe())
}
// indexHandler renders the landing page with every loaded recipe and the
// full tag-count map.
func indexHandler(w http.ResponseWriter, r *http.Request) {
	// Snapshot the shared store under a read lock; loadParquet replaces
	// (rather than mutates) these, so rendering after unlock is safe.
	store.RLock()
	recipes, tagCounts := store.list, store.tags
	store.RUnlock()

	Index(recipes, tagCounts).Render(r.Context(), w)
}
// searchHandler renders recipes whose title or text contains the query
// string "q" (case-insensitive substring match). An empty query renders an
// empty result set.
func searchHandler(w http.ResponseWriter, r *http.Request) {
	query := strings.TrimSpace(r.URL.Query().Get("q"))
	matches := []Recipe{}
	tagCounts := map[string]int{}

	if query != "" {
		needle := strings.ToLower(query)
		hit := func(rec *Recipe) bool {
			return strings.Contains(strings.ToLower(rec.Title), needle) ||
				strings.Contains(strings.ToLower(rec.Text), needle)
		}

		store.RLock()
		tagCounts = store.tags
		for i := range store.list {
			if hit(&store.list[i]) {
				matches = append(matches, store.list[i])
			}
		}
		store.RUnlock()
	}

	Search(query, matches, tagCounts).Render(r.Context(), w)
}
// tagHandler renders recipes carrying the exact keyword given in "q".
// An empty query renders an empty result set.
func tagHandler(w http.ResponseWriter, r *http.Request) {
	tag := strings.TrimSpace(r.URL.Query().Get("q"))
	matches := []Recipe{}
	tagCounts := map[string]int{}

	if tag != "" {
		store.RLock()
		tagCounts = store.tags
		for i := range store.list {
			if slices.Contains(store.list[i].Keywords, tag) {
				matches = append(matches, store.list[i])
			}
		}
		store.RUnlock()
	}

	TagPage(tag, matches, tagCounts).Render(r.Context(), w)
}
// recipeHandler renders a single recipe addressed as /recipe/{id}.
// It responds 404 when the path has no id, the id is not a number, or no
// recipe with that ID is loaded.
func recipeHandler(w http.ResponseWriter, r *http.Request) {
	idStr := strings.TrimPrefix(r.URL.Path, "/recipe/")
	if idStr == "" {
		http.NotFound(w, r)
		return
	}
	// Parse the id once instead of formatting every recipe's ID to a
	// string per comparison (the original did fmt.Sprint in the loop).
	id, err := strconv.Atoi(idStr)
	if err != nil {
		http.NotFound(w, r)
		return
	}

	var rec *Recipe
	store.RLock()
	for i := range store.list {
		if store.list[i].ID == id {
			rec = &store.list[i]
			break
		}
	}
	store.RUnlock()

	if rec == nil {
		http.NotFound(w, r)
		return
	}
	// Log through the standard logger like the rest of the file, not
	// fmt.Println.
	log.Printf("Showing recipe %d %s", rec.ID, rec.Title)
	RecipePage(*rec).Render(r.Context(), w)
}
// loadParquet reads the scraped dataset at path, builds a Recipe from every
// row whose JSON-LD was present, sorts the result by rating count (most
// rated first), and atomically swaps it into the global store. It returns
// only the parquet read error; an empty file is not an error.
func loadParquet(path string) error {
	// Use parquet-go's generic ReadFile to load rows into Go structs.
	rows, err := parquet.ReadFile[RecipeRow](path)
	if err != nil {
		return err
	}
	if len(rows) == 0 {
		return nil
	}

	// Debug: print first 5 rows (URL, JsonLdPresent, JsonLd preview).
	// Uses the min builtin rather than a local `max` variable that
	// shadowed the Go 1.21 builtin.
	previewRows := min(5, len(rows))
	log.Printf("Parquet rows: total=%d. Showing first %d rows:\n", len(rows), previewRows)
	for i := 0; i < previewRows; i++ {
		preview := rows[i].JsonLd
		if len(preview) > 200 {
			preview = preview[:200] + "..."
		}
		log.Printf("row %d: URL=%s JsonLdPresent=%v JsonLdPreview=%q\n", i+1, rows[i].URL, rows[i].JsonLdPresent, preview)
	}

	tmp := make([]Recipe, 0, len(rows))
	tags := make(map[string]int)
	id := 1
	for _, r := range rows {
		// only include rows where json_ld was present
		if !r.JsonLdPresent {
			continue
		}
		title := extractNameFromJsonLd(r.JsonLd)
		if title == "" {
			// fallback to URL when no name found
			title = r.URL
		}
		text := extractTextFromJsonLd(r.JsonLd)
		desc, ings, instr, author, image, keywords, aggregateRating, ratingCount := extractRecipeFields(r.JsonLd)
		for _, k := range keywords {
			tags[k]++
		}
		tmp = append(tmp, Recipe{
			ID: id, URL: r.URL, Title: title, Text: text,
			Description: desc, Ingredients: ings, Instructions: instr,
			RawJsonLd: r.JsonLd, Author: author, ImageURL: image,
			Keywords: keywords, AggregateRating: aggregateRating, RatingCount: ratingCount,
		})
		id++
	}

	// Sort most-rated first. (The previous comment said "sort by title",
	// which contradicted the code; the code's ordering is kept.)
	sort.Slice(tmp, func(i, j int) bool { return tmp[i].RatingCount > tmp[j].RatingCount })

	store.Lock()
	store.list = tmp
	store.tags = tags
	store.Unlock()
	log.Printf("Loaded %d recipes from parquet", len(tmp))
	return nil
}
// extractTextFromJsonLd decodes a JSON-LD string (plain JSON first, then a
// gzip-compressed fallback) and concatenates the common recipe fields into
// one searchable text blob. Returns "" when nothing can be decoded.
func extractTextFromJsonLd(s string) string {
	if s == "" {
		return ""
	}

	// The blob may be a JSON object or array; try decoding it as-is.
	var decoded interface{}
	if err := json.NewDecoder(strings.NewReader(s)).Decode(&decoded); err != nil {
		// Some blobs arrive gzip-compressed — retry after decompressing.
		gz, gzErr := tryGunzip([]byte(s))
		if gzErr != nil {
			return ""
		}
		if err := json.NewDecoder(bytes.NewReader(gz)).Decode(&decoded); err != nil {
			return ""
		}
	}

	// Walk the decoded tree collecting text off Recipe objects.
	var sb strings.Builder
	walkJSON(decoded, &sb)
	return sb.String()
}
// walkJSON recursively traverses decoded JSON; whenever it meets an object
// whose @type mentions "recipe", it appends that object's name,
// description, instructions and ingredients to buf (space-separated).
func walkJSON(v interface{}, buf *strings.Builder) {
	switch node := v.(type) {
	case map[string]interface{}:
		if tv, ok := node["@type"].(string); ok && strings.Contains(strings.ToLower(tv), "recipe") {
			// Collect the searchable recipe fields in a fixed order.
			for _, field := range []string{"name", "description", "recipeInstructions", "recipeIngredient"} {
				if val, present := node[field]; present {
					collectText(val, buf)
					buf.WriteString(" ")
				}
			}
		}
		// Recurse into every child value regardless of type.
		for _, child := range node {
			walkJSON(child, buf)
		}
	case []interface{}:
		for _, item := range node {
			walkJSON(item, buf)
		}
	}
}
func collectText(v interface{}, buf *strings.Builder) {
switch t := v.(type) {
case string:
buf.WriteString(t)
case []interface{}:
for _, e := range t {
collectText(e, buf)
buf.WriteString(";")
}
case map[string]interface{}:
if name, ok := t["text"]; ok {
if s, ok2 := name.(string); ok2 {
buf.WriteString(s)
}
}
}
}
func tryGunzip(b []byte) ([]byte, error) {
r, err := gzip.NewReader(bytes.NewReader(b))
if err != nil {
return nil, err
}
defer r.Close()
return io.ReadAll(r)
}
// extractNameFromJsonLd decodes the JSON-LD string (plain JSON first, gzip
// fallback second) and returns the first recipe "name" it finds, or the
// empty string when decoding fails or no name exists.
func extractNameFromJsonLd(s string) string {
	if s == "" {
		return ""
	}

	var parsed interface{}
	if err := json.NewDecoder(strings.NewReader(s)).Decode(&parsed); err != nil {
		// The blob may be gzip-compressed; retry after decompressing.
		gz, gzErr := tryGunzip([]byte(s))
		if gzErr != nil {
			return ""
		}
		if err := json.NewDecoder(bytes.NewReader(gz)).Decode(&parsed); err != nil {
			return ""
		}
	}

	// findNameInJSON already returns "" when nothing matches.
	return findNameInJSON(parsed)
}
// findNameInJSON traverses decoded JSON and returns the first "name" string
// found on an object whose @type contains "recipe" (case-insensitive).
// Returns the empty string when no such name exists.
func findNameInJSON(v interface{}) string {
	switch node := v.(type) {
	case map[string]interface{}:
		if tv, ok := node["@type"].(string); ok && strings.Contains(strings.ToLower(tv), "recipe") {
			if name, ok := node["name"].(string); ok {
				return name
			}
		}
		// Not a recipe (or no string name) — keep searching the children.
		for _, child := range node {
			if found := findNameInJSON(child); found != "" {
				return found
			}
		}
	case []interface{}:
		for _, item := range node {
			if found := findNameInJSON(item); found != "" {
				return found
			}
		}
	}
	return ""
}
// extractRecipeFields returns description, ingredients, instructions, author,
// image URL, keywords, aggregate rating, and rating count parsed from JSON-LD.
//
// The blob may be plain JSON or gzip-compressed JSON; all eight zero values
// are returned when nothing decodes. Scalar fields keep first-wins semantics
// across multiple Recipe objects; ingredients and instructions accumulate.
// Compared with the original inline version: the unreachable `case int`
// branch for ratingCount is removed (encoding/json decodes numbers only to
// float64), and string numbers are parsed with strconv instead of the
// empty-bodied fmt.Sscanf pattern.
func extractRecipeFields(s string) (string, []string, []string, string, string, []string, float64, int) {
	if s == "" {
		return "", nil, nil, "", "", nil, 0, 0
	}
	var v interface{}
	if err := json.NewDecoder(strings.NewReader(s)).Decode(&v); err != nil {
		gz, gzErr := tryGunzip([]byte(s))
		if gzErr != nil {
			return "", nil, nil, "", "", nil, 0, 0
		}
		if err := json.NewDecoder(bytes.NewReader(gz)).Decode(&v); err != nil {
			return "", nil, nil, "", "", nil, 0, 0
		}
	}

	var (
		desc, author, image   string
		ings, instr, keywords []string
		rating                float64
		ratingCount           int
	)

	var walk func(interface{})
	walk = func(node interface{}) {
		switch t := node.(type) {
		case map[string]interface{}:
			if tv, ok := t["@type"].(string); ok && strings.Contains(strings.ToLower(tv), "recipe") {
				if desc == "" {
					if d, ok := t["description"].(string); ok {
						desc = d
					}
				}
				ings = append(ings, jsonLdStrings(t["recipeIngredient"])...)
				instr = append(instr, jsonLdInstructions(t["recipeInstructions"])...)
				if author == "" {
					author = jsonLdAuthor(t["author"])
				}
				if image == "" {
					image = jsonLdImage(t["image"])
				}
				if len(keywords) == 0 {
					keywords = jsonLdKeywords(t["keywords"])
				}
				// ratingCount is deliberately read only while rating is
				// still unset — this mirrors the original guard, which
				// nested both reads under `aggregateRating == 0`.
				if rating == 0 {
					if agg, ok := t["aggregateRating"].(map[string]interface{}); ok {
						if f, ok := jsonLdFloat(agg["ratingValue"]); ok {
							rating = f
						}
						if n, ok := jsonLdInt(agg["ratingCount"]); ok {
							ratingCount = n
						}
					}
				}
			}
			// Recurse into every child value, recipe object or not.
			for _, child := range t {
				walk(child)
			}
		case []interface{}:
			for _, e := range t {
				walk(e)
			}
		}
	}
	walk(v)
	return desc, ings, instr, author, image, keywords, rating, ratingCount
}

// jsonLdStrings flattens a JSON-LD value that is either a string or an
// array of strings into a string slice (non-strings in arrays are skipped).
func jsonLdStrings(v interface{}) []string {
	switch t := v.(type) {
	case string:
		return []string{t}
	case []interface{}:
		var out []string
		for _, e := range t {
			if s, ok := e.(string); ok {
				out = append(out, s)
			}
		}
		return out
	}
	return nil
}

// jsonLdInstructions extracts instruction steps: a bare string, or an array
// whose elements are strings or HowToStep-like objects with a "text" key.
func jsonLdInstructions(v interface{}) []string {
	switch t := v.(type) {
	case string:
		return []string{t}
	case []interface{}:
		var out []string
		for _, step := range t {
			switch st := step.(type) {
			case string:
				out = append(out, st)
			case map[string]interface{}:
				if txt, ok := st["text"].(string); ok {
					out = append(out, txt)
				}
			}
		}
		return out
	}
	return nil
}

// jsonLdAuthor extracts an author name from a string, an object with a
// "name" key, or the first element of an array of either.
func jsonLdAuthor(v interface{}) string {
	switch t := v.(type) {
	case string:
		return t
	case map[string]interface{}:
		if s, ok := t["name"].(string); ok {
			return s
		}
	case []interface{}:
		if len(t) > 0 {
			switch e := t[0].(type) {
			case string:
				return e
			case map[string]interface{}:
				if s, ok := e["name"].(string); ok {
					return s
				}
			}
		}
	}
	return ""
}

// jsonLdImage extracts an image URL: a string, an object ("url" key, with
// "@id" consulted only when "url" is absent), or the first array element
// (string or object with "url" — no "@id" fallback there, as before).
func jsonLdImage(v interface{}) string {
	switch t := v.(type) {
	case string:
		return t
	case map[string]interface{}:
		if u, present := t["url"]; present {
			s, _ := u.(string)
			return s
		}
		if id, ok := t["@id"].(string); ok {
			return id
		}
	case []interface{}:
		if len(t) > 0 {
			switch e := t[0].(type) {
			case string:
				return e
			case map[string]interface{}:
				if s, ok := e["url"].(string); ok {
					return s
				}
			}
		}
	}
	return ""
}

// jsonLdKeywords extracts keywords from a comma-separated string (empty
// parts dropped) or an array of strings (trimmed; empties kept, matching
// the original behavior).
func jsonLdKeywords(v interface{}) []string {
	switch t := v.(type) {
	case string:
		var out []string
		for _, part := range strings.Split(t, ",") {
			if p := strings.TrimSpace(part); p != "" {
				out = append(out, p)
			}
		}
		return out
	case []interface{}:
		var out []string
		for _, e := range t {
			if s, ok := e.(string); ok {
				out = append(out, strings.TrimSpace(s))
			}
		}
		return out
	}
	return nil
}

// jsonLdFloat converts a JSON number or numeric string to float64.
// strconv is stricter than the previous fmt.Sscanf (trailing junk is
// rejected rather than silently truncated).
func jsonLdFloat(v interface{}) (float64, bool) {
	switch n := v.(type) {
	case float64:
		return n, true
	case string:
		if f, err := strconv.ParseFloat(strings.TrimSpace(n), 64); err == nil {
			return f, true
		}
	}
	return 0, false
}

// jsonLdInt converts a JSON number or numeric string to int.
func jsonLdInt(v interface{}) (int, bool) {
	switch n := v.(type) {
	case float64:
		return int(n), true
	case string:
		if i, err := strconv.Atoi(strings.TrimSpace(n)); err == nil {
			return i, true
		}
	}
	return 0, false
}
// fetchRecipesIfMissing downloads the parquet file from Hugging Face if it's
// not already present at dest. The source URL can be overridden with
// HF_DATASET_URL, and an optional HF_TOKEN is sent as a bearer token for
// private datasets. Transient failures (network, bad status, partial write)
// are retried up to three times with linear backoff; a local filesystem
// failure creating dest aborts immediately.
func fetchRecipesIfMissing(dest string) error {
	if _, err := os.Stat(dest); err == nil {
		return nil // already exists
	}
	url := os.Getenv("HF_DATASET_URL")
	if url == "" {
		url = datasetURL
	}
	// support HF token for private datasets
	token := os.Getenv("HF_TOKEN")
	// create directory
	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
		return err
	}

	// NOTE(review): the 60s client timeout bounds the ENTIRE download,
	// including the body — confirm it is long enough for this file size.
	client := &http.Client{Timeout: 60 * time.Second}
	var lastErr error
	for attempt := 1; attempt <= 3; attempt++ {
		// A malformed URL is not retryable — surface it immediately
		// (the error was previously discarded with `_`).
		req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, url, nil)
		if err != nil {
			return fmt.Errorf("building request for %s: %w", url, err)
		}
		if token != "" {
			req.Header.Set("Authorization", "Bearer "+token)
		}

		resp, err := client.Do(req)
		if err != nil {
			lastErr = err
			time.Sleep(time.Duration(attempt) * time.Second)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
			resp.Body.Close()
			lastErr = fmt.Errorf("bad status %d: %s", resp.StatusCode, string(body))
			time.Sleep(time.Duration(attempt) * time.Second)
			continue
		}

		// Stream the body to dest. Treat a Close failure like a Copy
		// failure — a failed flush can leave a truncated file that the
		// original code reported as success.
		out, err := os.Create(dest)
		if err != nil {
			resp.Body.Close()
			return err
		}
		_, copyErr := io.Copy(out, resp.Body)
		resp.Body.Close()
		if closeErr := out.Close(); copyErr == nil {
			copyErr = closeErr
		}
		if copyErr != nil {
			lastErr = copyErr
			os.Remove(dest) // don't leave a partial download behind
			time.Sleep(time.Duration(attempt) * time.Second)
			continue
		}
		return nil
	}
	return lastErr
}