|
|
package main |
|
|
|
|
|
import ( |
|
|
"bytes" |
|
|
"compress/gzip" |
|
|
"context" |
|
|
"encoding/json" |
|
|
"fmt" |
|
|
"io" |
|
|
"log" |
|
|
"net/http" |
|
|
"os" |
|
|
"path/filepath" |
|
|
"slices" |
|
|
"sort" |
|
|
"strings" |
|
|
"sync" |
|
|
"time" |
|
|
|
|
|
"github.com/parquet-go/parquet-go" |
|
|
) |
|
|
|
|
|
|
|
|
// RecipeRow is the shape of one row read from the recipes parquet file.
// Each field is bound to the parquet column named in its struct tag; the
// json_ld and html columns are additionally tagged with zstd compression.
//
// loadParquet skips rows whose JsonLdPresent flag is false and feeds the
// JsonLd payload of the remaining rows to the JSON-LD extractors.
type RecipeRow struct {
	URL               string `parquet:"url"`
	Title             string `parquet:"title"`
	JsonLd            string `parquet:"json_ld,zstd"`
	Html              string `parquet:"html,zstd"`
	Sitemap           string `parquet:"sitemap"`
	ScrapeTimestamp   int64  `parquet:"scrape_timestamp"`
	JsonLdPresent     bool   `parquet:"json_ld_present"`
	HtmlRecipePresent bool   `parquet:"html_recipe_present"`
	HttpStatusCode    int    `parquet:"http_status_code"`
}
|
|
|
|
|
|
|
|
// Recipe is the in-memory representation of a single recipe as built by
// loadParquet from a parquet row and its JSON-LD payload.
//
// ID is assigned sequentially by loadParquet, Text is the concatenated
// searchable text produced by extractTextFromJsonLd, and RawJsonLd keeps
// the unparsed JSON-LD string from the source row. The struct tags give
// the field names used when a Recipe is encoded as JSON.
type Recipe struct {
	ID              int      `json:"id"`
	URL             string   `json:"url"`
	Title           string   `json:"title"`
	Text            string   `json:"text"`
	Description     string   `json:"description"`
	Ingredients     []string `json:"ingredients"`
	Instructions    []string `json:"instructions"`
	RawJsonLd       string   `json:"raw_json_ld"`
	Author          string   `json:"author"`
	ImageURL        string   `json:"image_url"`
	Keywords        []string `json:"keywords"`
	AggregateRating float64  `json:"aggregate_rating"`
	RatingCount     int      `json:"rating_count"`
}
|
|
|
|
|
var ( |
|
|
store = struct { |
|
|
sync.RWMutex |
|
|
list []Recipe |
|
|
tags map[string]int |
|
|
}{} |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
// datasetURL is the default location the recipes parquet file is downloaded
// from; fetchRecipesIfMissing falls back to it when the HF_DATASET_URL
// environment variable is empty.
var datasetURL = "https://huggingface.co/datasets/Wissotsky/HebrewRecipes/resolve/main/recipes.parquet"
|
|
|
|
|
func main() { |
|
|
|
|
|
os.MkdirAll("data", 0755) |
|
|
|
|
|
|
|
|
parquetPath := filepath.Join("data", "recipes.parquet") |
|
|
if err := fetchRecipesIfMissing(parquetPath); err != nil { |
|
|
log.Printf("warning: could not ensure parquet file: %v", err) |
|
|
} |
|
|
|
|
|
if err := loadParquet(parquetPath); err != nil { |
|
|
log.Printf("failed loading parquet: %v", err) |
|
|
} |
|
|
|
|
|
mux := http.NewServeMux() |
|
|
mux.HandleFunc("/", indexHandler) |
|
|
mux.HandleFunc("/tag", tagHandler) |
|
|
mux.HandleFunc("/search", searchHandler) |
|
|
mux.HandleFunc("/recipe/", recipeHandler) |
|
|
mux.Handle("/static/", http.StripPrefix("/static/", http.FileServer(http.Dir("static")))) |
|
|
|
|
|
addr := ":8080" |
|
|
log.Printf("Starting server on %s", addr) |
|
|
log.Fatal(http.ListenAndServe(addr, mux)) |
|
|
} |
|
|
|
|
|
func indexHandler(w http.ResponseWriter, r *http.Request) { |
|
|
|
|
|
store.RLock() |
|
|
list := store.list |
|
|
tags := store.tags |
|
|
store.RUnlock() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
component := Index(list, tags) |
|
|
component.Render(r.Context(), w) |
|
|
} |
|
|
|
|
|
func searchHandler(w http.ResponseWriter, r *http.Request) { |
|
|
q := strings.TrimSpace(r.URL.Query().Get("q")) |
|
|
results := []Recipe{} |
|
|
tags := map[string]int{} |
|
|
if q != "" { |
|
|
ql := strings.ToLower(q) |
|
|
store.RLock() |
|
|
tags = store.tags |
|
|
for _, rec := range store.list { |
|
|
if strings.Contains(strings.ToLower(rec.Title), ql) || strings.Contains(strings.ToLower(rec.Text), ql) { |
|
|
results = append(results, rec) |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
store.RUnlock() |
|
|
} |
|
|
|
|
|
Search(q, results, tags).Render(r.Context(), w) |
|
|
} |
|
|
|
|
|
func tagHandler(w http.ResponseWriter, r *http.Request) { |
|
|
q := strings.TrimSpace(r.URL.Query().Get("q")) |
|
|
results := []Recipe{} |
|
|
tags := map[string]int{} |
|
|
if q != "" { |
|
|
store.RLock() |
|
|
tags = store.tags |
|
|
for _, rec := range store.list { |
|
|
if slices.Contains(rec.Keywords, q) { |
|
|
results = append(results, rec) |
|
|
} |
|
|
} |
|
|
store.RUnlock() |
|
|
} |
|
|
|
|
|
TagPage(q, results, tags).Render(r.Context(), w) |
|
|
} |
|
|
func recipeHandler(w http.ResponseWriter, r *http.Request) { |
|
|
idStr := strings.TrimPrefix(r.URL.Path, "/recipe/") |
|
|
if idStr == "" { |
|
|
http.NotFound(w, r) |
|
|
return |
|
|
} |
|
|
var rec *Recipe |
|
|
store.RLock() |
|
|
for i := range store.list { |
|
|
if fmt.Sprint(store.list[i].ID) == idStr { |
|
|
rec = &store.list[i] |
|
|
break |
|
|
} |
|
|
} |
|
|
store.RUnlock() |
|
|
if rec == nil { |
|
|
http.NotFound(w, r) |
|
|
return |
|
|
} |
|
|
fmt.Println("Showing recipe", rec.ID, rec.Title) |
|
|
RecipePage(*rec).Render(r.Context(), w) |
|
|
} |
|
|
|
|
|
func loadParquet(path string) error { |
|
|
|
|
|
rows, err := parquet.ReadFile[RecipeRow](path) |
|
|
if err != nil { |
|
|
return err |
|
|
} |
|
|
if len(rows) == 0 { |
|
|
return nil |
|
|
} |
|
|
|
|
|
|
|
|
max := 5 |
|
|
if len(rows) < max { |
|
|
max = len(rows) |
|
|
} |
|
|
log.Printf("Parquet rows: total=%d. Showing first %d rows:\n", len(rows), max) |
|
|
for i := 0; i < max; i++ { |
|
|
preview := rows[i].JsonLd |
|
|
if len(preview) > 200 { |
|
|
preview = preview[:200] + "..." |
|
|
} |
|
|
log.Printf("row %d: URL=%s JsonLdPresent=%v JsonLdPreview=%q\n", i+1, rows[i].URL, rows[i].JsonLdPresent, preview) |
|
|
} |
|
|
|
|
|
tmp := make([]Recipe, 0, len(rows)) |
|
|
tags := make(map[string]int) |
|
|
id := 1 |
|
|
for _, r := range rows { |
|
|
|
|
|
if !r.JsonLdPresent { |
|
|
continue |
|
|
} |
|
|
|
|
|
title := extractNameFromJsonLd(r.JsonLd) |
|
|
if title == "" { |
|
|
|
|
|
title = r.URL |
|
|
} |
|
|
text := extractTextFromJsonLd(r.JsonLd) |
|
|
desc, ings, instr, author, image, keywords, aggregateRating, ratingCount := extractRecipeFields(r.JsonLd) |
|
|
for _, k := range keywords { |
|
|
tags[k]++ |
|
|
} |
|
|
tmp = append(tmp, Recipe{ID: id, URL: r.URL, Title: title, Text: text, Description: desc, Ingredients: ings, Instructions: instr, RawJsonLd: r.JsonLd, Author: author, ImageURL: image, Keywords: keywords, AggregateRating: aggregateRating, RatingCount: ratingCount}) |
|
|
id++ |
|
|
} |
|
|
|
|
|
|
|
|
sort.Slice(tmp, func(i, j int) bool { return tmp[i].RatingCount > tmp[j].RatingCount }) |
|
|
|
|
|
store.Lock() |
|
|
store.list = tmp |
|
|
store.tags = tags |
|
|
store.Unlock() |
|
|
log.Printf("Loaded %d recipes from parquet", len(tmp)) |
|
|
return nil |
|
|
} |
|
|
|
|
|
|
|
|
// decodeJsonLd parses s as a JSON document. When s is not valid JSON it is
// treated as a gzip stream and the decompressed bytes are parsed instead.
// The boolean result is false when s is empty or neither attempt yields a
// JSON value. Only the first JSON value in the input is decoded.
func decodeJsonLd(s string) (any, bool) {
	if s == "" {
		return nil, false
	}
	var doc any
	if err := json.NewDecoder(strings.NewReader(s)).Decode(&doc); err == nil {
		return doc, true
	}
	raw, err := tryGunzip([]byte(s))
	if err != nil {
		return nil, false
	}
	if err := json.NewDecoder(bytes.NewReader(raw)).Decode(&doc); err != nil {
		return nil, false
	}
	return doc, true
}

// isRecipeNode reports whether a JSON-LD object declares a type whose name
// contains "recipe" (case-insensitively). JSON-LD allows @type to be either
// a single string or an array of strings; both forms are accepted.
func isRecipeNode(node map[string]any) bool {
	switch tp := node["@type"].(type) {
	case string:
		return strings.Contains(strings.ToLower(tp), "recipe")
	case []any:
		for _, e := range tp {
			if s, ok := e.(string); ok && strings.Contains(strings.ToLower(s), "recipe") {
				return true
			}
		}
	}
	return false
}

// jsonLdSortedKeys returns the keys of m in ascending order. Go randomises
// map iteration, so the tree walkers use this to visit children in a fixed
// order and produce the same result on every run.
func jsonLdSortedKeys(m map[string]any) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	slices.Sort(keys)
	return keys
}

// extractTextFromJsonLd returns the searchable text of every Recipe node in
// the JSON-LD document s: the name, description, instructions and
// ingredients of each node, in that order, each followed by a space.
// It returns "" when s is empty or cannot be decoded.
func extractTextFromJsonLd(s string) string {
	doc, ok := decodeJsonLd(s)
	if !ok {
		return ""
	}
	var buf strings.Builder
	walkJSON(doc, &buf)
	return buf.String()
}

// walkJSON traverses the decoded JSON value v depth-first and appends to buf
// the text of every Recipe node it encounters. Object members are visited in
// sorted key order and array elements in document order.
func walkJSON(v any, buf *strings.Builder) {
	switch t := v.(type) {
	case map[string]any:
		if isRecipeNode(t) {
			for _, field := range []string{"name", "description", "recipeInstructions", "recipeIngredient"} {
				if val, ok := t[field]; ok {
					collectText(val, buf)
					buf.WriteString(" ")
				}
			}
		}
		for _, k := range jsonLdSortedKeys(t) {
			walkJSON(t[k], buf)
		}
	case []any:
		for _, e := range t {
			walkJSON(e, buf)
		}
	}
}

// collectText appends the textual content of v to buf: a string is written
// as is, every element of an array is written followed by ";", and for an
// object only its string-valued "text" member is written. Other values are
// ignored.
func collectText(v any, buf *strings.Builder) {
	switch t := v.(type) {
	case string:
		buf.WriteString(t)
	case []any:
		for _, e := range t {
			collectText(e, buf)
			buf.WriteString(";")
		}
	case map[string]any:
		if s, ok := t["text"].(string); ok {
			buf.WriteString(s)
		}
	}
}

// tryGunzip decompresses b as a gzip stream and returns the decompressed
// bytes. It returns an error when b does not start with a valid gzip header
// or the stream is corrupt.
func tryGunzip(b []byte) ([]byte, error) {
	r, err := gzip.NewReader(bytes.NewReader(b))
	if err != nil {
		return nil, err
	}
	defer r.Close()
	return io.ReadAll(r)
}

// extractNameFromJsonLd returns the name of the first Recipe node found in
// the JSON-LD document s, or "" when s is empty, cannot be decoded, or
// contains no Recipe node with a string name.
func extractNameFromJsonLd(s string) string {
	doc, ok := decodeJsonLd(s)
	if !ok {
		return ""
	}
	return findNameInJSON(doc)
}

// findNameInJSON searches the decoded JSON value v depth-first for a Recipe
// node with a string "name" member and returns that name, or "" when there
// is none. Object members are visited in sorted key order and array elements
// in document order, so the "first" match is deterministic.
func findNameInJSON(v any) string {
	switch t := v.(type) {
	case map[string]any:
		if isRecipeNode(t) {
			if name, ok := t["name"].(string); ok {
				return name
			}
		}
		for _, k := range jsonLdSortedKeys(t) {
			if found := findNameInJSON(t[k]); found != "" {
				return found
			}
		}
	case []any:
		for _, e := range t {
			if found := findNameInJSON(e); found != "" {
				return found
			}
		}
	}
	return ""
}

// jsonLdRecipeFields accumulates the values extractRecipeFields collects
// while walking a JSON-LD document.
type jsonLdRecipeFields struct {
	description  string
	ingredients  []string
	instructions []string
	author       string
	image        string
	keywords     []string
	rating       float64
	ratingCount  int
}

// jsonLdStringOf resolves a JSON-LD value that may be a plain string, an
// object, or an array of either, to a single string. For an object it
// returns the first string-valued member among keys; for an array only the
// first element is considered. It returns "" when nothing matches.
func jsonLdStringOf(v any, keys ...string) string {
	switch t := v.(type) {
	case string:
		return t
	case map[string]any:
		for _, k := range keys {
			if s, ok := t[k].(string); ok {
				return s
			}
		}
	case []any:
		if len(t) > 0 {
			return jsonLdStringOf(t[0], keys...)
		}
	}
	return ""
}

// absorb copies the recipe properties of one Recipe node into f.
// Description, author, image, keywords and the rating are taken from the
// first node that supplies them; ingredients and instructions are appended
// from every node.
func (f *jsonLdRecipeFields) absorb(node map[string]any) {
	if d, ok := node["description"].(string); ok && f.description == "" {
		f.description = d
	}

	switch ing := node["recipeIngredient"].(type) {
	case string:
		f.ingredients = append(f.ingredients, ing)
	case []any:
		for _, e := range ing {
			if s, ok := e.(string); ok {
				f.ingredients = append(f.ingredients, s)
			}
		}
	}

	switch ins := node["recipeInstructions"].(type) {
	case string:
		f.instructions = append(f.instructions, ins)
	case []any:
		for _, step := range ins {
			switch st := step.(type) {
			case string:
				f.instructions = append(f.instructions, st)
			case map[string]any:
				if txt, ok := st["text"].(string); ok {
					f.instructions = append(f.instructions, txt)
				}
			}
		}
	}

	if f.author == "" {
		f.author = jsonLdStringOf(node["author"], "name")
	}
	if f.image == "" {
		f.image = jsonLdStringOf(node["image"], "url", "@id")
	}

	if len(f.keywords) == 0 {
		switch kw := node["keywords"].(type) {
		case string:
			// A single string holds a comma-separated list.
			for _, part := range strings.Split(kw, ",") {
				if k := strings.TrimSpace(part); k != "" {
					f.keywords = append(f.keywords, k)
				}
			}
		case []any:
			for _, e := range kw {
				s, ok := e.(string)
				if !ok {
					continue
				}
				// Drop blank entries, matching the string form above, so
				// that no empty tag is ever counted.
				if k := strings.TrimSpace(s); k != "" {
					f.keywords = append(f.keywords, k)
				}
			}
		}
	}

	if ar, ok := node["aggregateRating"].(map[string]any); ok && f.rating == 0 {
		// Numbers decode as float64; some sites emit them as strings.
		switch rv := ar["ratingValue"].(type) {
		case float64:
			f.rating = rv
		case string:
			var parsed float64
			if _, err := fmt.Sscanf(rv, "%f", &parsed); err == nil {
				f.rating = parsed
			}
		}
		switch rc := ar["ratingCount"].(type) {
		case float64:
			f.ratingCount = int(rc)
		case string:
			var parsed int
			if _, err := fmt.Sscanf(rc, "%d", &parsed); err == nil {
				f.ratingCount = parsed
			}
		}
	}
}

// walk traverses the decoded JSON value depth-first, absorbing every Recipe
// node. Object members are visited in sorted key order.
func (f *jsonLdRecipeFields) walk(node any) {
	switch t := node.(type) {
	case map[string]any:
		if isRecipeNode(t) {
			f.absorb(t)
		}
		for _, k := range jsonLdSortedKeys(t) {
			f.walk(t[k])
		}
	case []any:
		for _, e := range t {
			f.walk(e)
		}
	}
}

// extractRecipeFields decodes the JSON-LD document s and returns, in order:
// description, ingredients, instructions, author, image URL, keywords,
// aggregate rating value and rating count of the Recipe nodes it contains.
// Every result is the zero value when s is empty or cannot be decoded.
func extractRecipeFields(s string) (string, []string, []string, string, string, []string, float64, int) {
	doc, ok := decodeJsonLd(s)
	if !ok {
		return "", nil, nil, "", "", nil, 0, 0
	}
	var f jsonLdRecipeFields
	f.walk(doc)
	return f.description, f.ingredients, f.instructions, f.author, f.image, f.keywords, f.rating, f.ratingCount
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
func fetchRecipesIfMissing(dest string) error { |
|
|
if _, err := os.Stat(dest); err == nil { |
|
|
return nil |
|
|
} |
|
|
|
|
|
url := os.Getenv("HF_DATASET_URL") |
|
|
if url == "" { |
|
|
url = datasetURL |
|
|
} |
|
|
|
|
|
|
|
|
token := os.Getenv("HF_TOKEN") |
|
|
|
|
|
|
|
|
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { |
|
|
return err |
|
|
} |
|
|
|
|
|
client := &http.Client{Timeout: 60 * time.Second} |
|
|
var lastErr error |
|
|
for attempt := 1; attempt <= 3; attempt++ { |
|
|
req, _ := http.NewRequestWithContext(context.Background(), "GET", url, nil) |
|
|
if token != "" { |
|
|
req.Header.Set("Authorization", "Bearer "+token) |
|
|
} |
|
|
|
|
|
resp, err := client.Do(req) |
|
|
if err != nil { |
|
|
lastErr = err |
|
|
time.Sleep(time.Duration(attempt) * time.Second) |
|
|
continue |
|
|
} |
|
|
if resp.StatusCode != 200 { |
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) |
|
|
resp.Body.Close() |
|
|
lastErr = fmt.Errorf("bad status %d: %s", resp.StatusCode, string(body)) |
|
|
time.Sleep(time.Duration(attempt) * time.Second) |
|
|
continue |
|
|
} |
|
|
|
|
|
|
|
|
out, err := os.Create(dest) |
|
|
if err != nil { |
|
|
resp.Body.Close() |
|
|
return err |
|
|
} |
|
|
_, err = io.Copy(out, resp.Body) |
|
|
resp.Body.Close() |
|
|
out.Close() |
|
|
if err != nil { |
|
|
lastErr = err |
|
|
os.Remove(dest) |
|
|
time.Sleep(time.Duration(attempt) * time.Second) |
|
|
continue |
|
|
} |
|
|
return nil |
|
|
} |
|
|
return lastErr |
|
|
} |
|
|
|