Alex Latipov committed on
Commit
86083ec
·
1 Parent(s): ac1fd0a

Expose raw DBpedia and corporate SPARQL proxy endpoints

Browse files
deployment/hf_eval_backend/README.md CHANGED
@@ -1,35 +1,46 @@
1
- # HF Evaluation Backend
2
 
3
- This package is the frozen evaluation backend for the Text2SPARQL challenge.
4
 
5
- Frozen configuration:
6
- - model: `gpt-5-mini`
7
- - judge mode: `single_general`
8
- - linker: `internal_max`
9
- - syntax attempts: `5`
10
- - semantic attempts: `5`
11
- - repair prompt: `local_edit`
12
 
13
- Main files:
14
- - `configs/final_gpt5mini_single_general_5_5_internal_max_local_edit.yaml`
15
- - `app.py`
16
- - `start_backend.sh`
17
- - `Dockerfile`
18
 
19
- API:
20
  - `GET /health`
21
- - `GET /text2sparql?question=...&dataset=...` -> returns plain SPARQL text
22
- - `GET /text2sparql.json?question=...&dataset=...` -> returns JSON with query + status
23
-
24
- Environment variables:
25
- - `OPENAI_API_KEY` required
26
- - `DBPEDIA_ENDPOINT_URL` optional override for DBpedia endpoint
27
- - `CORPORATE_ENDPOINT_URL` optional override for corporate endpoint
28
- - `CORPORATE_GRAPH_URI` optional override for the locally loaded corporate graph
29
- - `PORT` optional, defaults to `7860`
30
-
31
- Notes:
32
- - This package is intended to stay stable while research continues elsewhere.
33
- - It reuses the main repo code from `src/`, but the serving config is frozen here.
34
- - In the HF Space, DBpedia is restored from the snapshot bucket and the small
35
- 2026 corporate KG is loaded at boot from `data/corporate_2026/`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HF KG Endpoint Backend
2
 
3
+ This package now serves the Hugging Face Space as a **raw SPARQL endpoint proxy** over the locally booted Virtuoso instance.
4
 
5
+ It no longer acts as the active challenge Text2SPARQL generation API.
 
 
 
 
 
 
6
 
7
+ ## Active Routes
 
 
 
 
8
 
 
9
  - `GET /health`
10
+ - `GET /sparql/dbpedia`
11
+ - `POST /sparql/dbpedia`
12
+ - `GET /sparql/corporate`
13
+ - `POST /sparql/corporate`
14
+
15
+ ## Behavior
16
+
17
+ - DBpedia proxy forwards to the local DBpedia Virtuoso endpoint restored from the snapshot bucket.
18
+ - Corporate proxy forwards to the same local Virtuoso instance, but through the corporate graph-aware endpoint URL.
19
+ - The startup flow is unchanged:
20
+ - restore DBpedia snapshot
21
+ - start Virtuoso
22
+ - load the corporate graph
23
+ - start FastAPI
24
+
25
+ ## Environment Variables
26
+
27
+ - `DBPEDIA_ENDPOINT_URL` optional override for the internal DBpedia upstream
28
+ - `CORPORATE_ENDPOINT_URL` optional override for the internal corporate upstream
29
+ - `CORPORATE_GRAPH_URI` optional corporate graph URI override
30
+ - `PORT` optional FastAPI port, default `7860`
31
+
32
+ ## Expected Public URLs
33
+
34
+ If the Space URL is:
35
+
36
+ - `https://insanalex-iris-at-text2sparql.hf.space`
37
+
38
+ then the public raw SPARQL endpoints are:
39
+
40
+ - `https://insanalex-iris-at-text2sparql.hf.space/sparql/dbpedia`
41
+ - `https://insanalex-iris-at-text2sparql.hf.space/sparql/corporate`
42
+
43
+ ## Notes
44
+
45
+ - The old challenge-specific `/text2sparql` routes are no longer part of the active HF app.
46
+ - The main repository still contains the repair pipeline code for local paper experiments; this deployment package now focuses only on stable KG access.
deployment/hf_eval_backend/app.py CHANGED
@@ -1,20 +1,18 @@
1
  from __future__ import annotations
2
 
3
  import os
4
- import uuid
5
  from functools import lru_cache
6
  from pathlib import Path
7
 
8
- from fastapi import FastAPI, HTTPException
 
9
  from fastapi.responses import PlainTextResponse
10
  from pydantic import BaseModel
11
 
12
  from src.config import RuntimeConfig, load_config
13
- from src.models import QueryRequest
14
- from src.pipeline import Text2SPARQLPipeline
15
 
16
 
17
- APP_TITLE = "TEXT2SPARQL Evaluation Backend"
18
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
19
  BASE_CONFIG_PATH = PROJECT_ROOT / "configs" / "default.yaml"
20
  FROZEN_CONFIG_PATH = (
@@ -22,29 +20,29 @@ FROZEN_CONFIG_PATH = (
22
  / "configs"
23
  / "final_gpt5mini_single_general_5_5_internal_max_local_edit.yaml"
24
  )
25
-
26
- SUPPORTED_DATASETS = {
27
- "https://text2sparql.aksw.org/2025/dbpedia/",
28
- "https://text2sparql.aksw.org/2025/corporate/",
29
- "https://text2sparql.aksw.org/2026/dbpedia/",
30
- "https://text2sparql.aksw.org/2026/corporate/",
 
 
 
 
 
 
31
  }
32
 
33
 
34
  class HealthResponse(BaseModel):
35
  status: str
36
  service: str
37
- dbpedia_endpoint_url: str
38
- corporate_endpoint_url: str
39
- config_path: str
40
-
41
-
42
- class QueryResponse(BaseModel):
43
- dataset: str
44
- question: str
45
- query: str
46
- status: str
47
- request_id: str
48
 
49
 
50
  def _apply_endpoint_overrides(runtime: RuntimeConfig) -> RuntimeConfig:
@@ -76,9 +74,67 @@ def get_runtime() -> RuntimeConfig:
76
  return _apply_endpoint_overrides(runtime)
77
 
78
 
79
- @lru_cache(maxsize=1)
80
- def get_pipeline() -> Text2SPARQLPipeline:
81
- return Text2SPARQLPipeline(get_runtime())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
  app = FastAPI(title=APP_TITLE)
@@ -86,73 +142,37 @@ app = FastAPI(title=APP_TITLE)
86
 
87
  @app.get("/health", response_model=HealthResponse)
88
  def health() -> HealthResponse:
89
- runtime = get_runtime()
90
  return HealthResponse(
91
  status="ok",
92
  service=APP_TITLE,
93
- dbpedia_endpoint_url=runtime.datasets["https://text2sparql.aksw.org/2026/dbpedia/"]["endpoint_url"],
94
- corporate_endpoint_url=runtime.datasets["https://text2sparql.aksw.org/2026/corporate/"]["endpoint_url"],
95
- config_path=str(FROZEN_CONFIG_PATH),
 
 
96
  )
97
 
98
 
99
- @app.get("/")
100
- def root(question: str | None = None, dataset: str | None = None):
101
- # Challenge client calls the API at "/" with dataset/question query params.
102
- if question is not None and dataset is not None:
103
- query, status, request_id = _run_query(dataset, question)
104
- return QueryResponse(
105
- dataset=dataset,
106
- question=question,
107
- query=query,
108
- status=status,
109
- request_id=request_id,
110
- )
111
- return PlainTextResponse("ok")
112
-
113
-
114
- def _run_query(dataset: str, question: str) -> tuple[str, str, str]:
115
- if dataset not in SUPPORTED_DATASETS:
116
- raise HTTPException(status_code=404, detail="Unsupported dataset identifier.")
117
-
118
- request_id = f"eval_{uuid.uuid4().hex[:12]}"
119
- request = QueryRequest(
120
- request_id=request_id,
121
- dataset_id=dataset,
122
- question=question,
123
- language=None,
124
- )
125
- trace = get_pipeline().run(request)
126
- if not trace.final_query:
127
- raise HTTPException(status_code=500, detail=f"Pipeline returned empty query ({trace.final_status}).")
128
- return trace.final_query, trace.final_status, request_id
129
-
130
-
131
- @app.get("/text2sparql", response_model=QueryResponse)
132
- def text2sparql(question: str, dataset: str) -> QueryResponse:
133
- query, status, request_id = _run_query(dataset, question)
134
- return QueryResponse(
135
- dataset=dataset,
136
- question=question,
137
- query=query,
138
- status=status,
139
- request_id=request_id,
140
  )
141
 
142
 
143
- @app.get("/text2sparql.raw", response_class=PlainTextResponse)
144
- def text2sparql_raw(question: str, dataset: str) -> str:
145
- query, _, _ = _run_query(dataset, question)
146
- return query
147
 
148
 
149
- @app.get("/text2sparql.json", response_model=QueryResponse)
150
- def text2sparql_json(question: str, dataset: str) -> QueryResponse:
151
- query, status, request_id = _run_query(dataset, question)
152
- return QueryResponse(
153
- dataset=dataset,
154
- question=question,
155
- query=query,
156
- status=status,
157
- request_id=request_id,
158
- )
 
1
from __future__ import annotations

import asyncio
import os
from functools import lru_cache
from pathlib import Path

import requests
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi.responses import PlainTextResponse
from pydantic import BaseModel

from src.config import RuntimeConfig, load_config
 
 
13
 
14
 
15
+ APP_TITLE = "TEXT2SPARQL KG Endpoint Backend"
16
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
17
  BASE_CONFIG_PATH = PROJECT_ROOT / "configs" / "default.yaml"
18
  FROZEN_CONFIG_PATH = (
 
20
  / "configs"
21
  / "final_gpt5mini_single_general_5_5_internal_max_local_edit.yaml"
22
  )
23
+ REQUEST_TIMEOUT_SEC = 180
24
+ EXCLUDED_PROXY_HEADERS = {
25
+ "connection",
26
+ "content-length",
27
+ "content-encoding",
28
+ "keep-alive",
29
+ "proxy-authenticate",
30
+ "proxy-authorization",
31
+ "te",
32
+ "trailer",
33
+ "transfer-encoding",
34
+ "upgrade",
35
  }
36
 
37
 
38
  class HealthResponse(BaseModel):
39
  status: str
40
  service: str
41
+ mode: str
42
+ dbpedia_proxy_path: str
43
+ corporate_proxy_path: str
44
+ dbpedia_upstream_url: str
45
+ corporate_upstream_url: str
 
 
 
 
 
 
46
 
47
 
48
  def _apply_endpoint_overrides(runtime: RuntimeConfig) -> RuntimeConfig:
 
74
  return _apply_endpoint_overrides(runtime)
75
 
76
 
77
+ def _upstream_url(kind: str) -> str:
78
+ runtime = get_runtime()
79
+ dataset_id_map = {
80
+ "dbpedia": "https://text2sparql.aksw.org/2026/dbpedia/",
81
+ "corporate": "https://text2sparql.aksw.org/2026/corporate/",
82
+ }
83
+ dataset_id = dataset_id_map[kind]
84
+ return runtime.datasets[dataset_id]["endpoint_url"]
85
+
86
+
87
+ def _proxy_headers(request: Request) -> dict[str, str]:
88
+ headers: dict[str, str] = {}
89
+ for name in ("accept", "content-type", "user-agent"):
90
+ value = request.headers.get(name)
91
+ if value:
92
+ headers[name] = value
93
+ return headers
94
+
95
+
96
+ def _response_from_upstream(upstream: requests.Response) -> Response:
97
+ headers = {
98
+ key: value
99
+ for key, value in upstream.headers.items()
100
+ if key.lower() not in EXCLUDED_PROXY_HEADERS
101
+ }
102
+ return Response(
103
+ content=upstream.content,
104
+ status_code=upstream.status_code,
105
+ headers=headers,
106
+ )
107
+
108
+
109
+ async def _proxy_sparql(request: Request, kind: str) -> Response:
110
+ endpoint_url = _upstream_url(kind)
111
+ params = list(request.query_params.multi_items())
112
+ headers = _proxy_headers(request)
113
+
114
+ try:
115
+ if request.method == "GET":
116
+ upstream = requests.get(
117
+ endpoint_url,
118
+ params=params,
119
+ headers=headers,
120
+ timeout=REQUEST_TIMEOUT_SEC,
121
+ )
122
+ else:
123
+ body = await request.body()
124
+ upstream = requests.post(
125
+ endpoint_url,
126
+ params=params,
127
+ data=body,
128
+ headers=headers,
129
+ timeout=REQUEST_TIMEOUT_SEC,
130
+ )
131
+ except requests.RequestException as exc:
132
+ raise HTTPException(
133
+ status_code=502,
134
+ detail=f"Upstream {kind} SPARQL endpoint request failed: {exc}",
135
+ ) from exc
136
+
137
+ return _response_from_upstream(upstream)
138
 
139
 
140
  app = FastAPI(title=APP_TITLE)
 
142
 
143
  @app.get("/health", response_model=HealthResponse)
144
  def health() -> HealthResponse:
 
145
  return HealthResponse(
146
  status="ok",
147
  service=APP_TITLE,
148
+ mode="sparql_proxy",
149
+ dbpedia_proxy_path="/sparql/dbpedia",
150
+ corporate_proxy_path="/sparql/corporate",
151
+ dbpedia_upstream_url=_upstream_url("dbpedia"),
152
+ corporate_upstream_url=_upstream_url("corporate"),
153
  )
154
 
155
 
156
+ @app.get("/", response_class=PlainTextResponse)
157
+ def root() -> str:
158
+ return (
159
+ "ok\n"
160
+ "Available SPARQL proxy endpoints:\n"
161
+ " /sparql/dbpedia\n"
162
+ " /sparql/corporate\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  )
164
 
165
 
166
+ @app.get("/sparql", response_class=PlainTextResponse)
167
+ def sparql_index() -> str:
168
+ return "Use /sparql/dbpedia or /sparql/corporate"
 
169
 
170
 
171
+ @app.api_route("/sparql/dbpedia", methods=["GET", "POST"])
172
+ async def sparql_dbpedia(request: Request) -> Response:
173
+ return await _proxy_sparql(request, "dbpedia")
174
+
175
+
176
+ @app.api_route("/sparql/corporate", methods=["GET", "POST"])
177
+ async def sparql_corporate(request: Request) -> Response:
178
+ return await _proxy_sparql(request, "corporate")