Source code for dataio.validate.validators.metadata
from __future__ import annotations
import re
from pydantic import ValidationError
from dataio.validate.contracts.models import DatasetManifest
from dataio.validate.reports.models import Finding, ValidationResult
from dataio.validate.validators.rules import apply_cross_field_rules
# Canonical dataset ID: two uppercase letters, four digits, the literal "DS",
# then four more digits (e.g. "CS0007DS0112").
# NOTE: without re.MULTILINE, "$" also matches just before a single trailing
# newline, so use fullmatch() when an exact, whole-string match is required.
DATASET_ID_RE = re.compile(r"^[A-Z]{2}\d{4}DS\d{4}$")
# Dataset slug: the lowercase form of the dataset ID (captured in the named
# group "dataset_id"), followed by one or more hyphen-separated words made of
# lowercase letters and digits (e.g. "cs0007ds0112-example-dataset").
SLUG_RE = re.compile(r"^(?P<dataset_id>[a-z]{2}\d{4}ds\d{4})-[a-z0-9]+(?:-[a-z0-9]+)*$")
[docs]
def validate_metadata_contract(
    manifest: DatasetManifest,
    result: ValidationResult,
) -> None:
    """Check a dataset manifest's metadata and record every problem found.

    The checks run in a fixed order, and each failure is appended to
    ``result`` as an error-severity ``Finding``:

    1. The manifest is dumped and re-validated against ``DatasetManifest``;
       each pydantic error becomes a ``manifest_contract_error`` finding.
    2. ``datasetID`` must be present and match ``DATASET_ID_RE``.
    3. ``datasetSlug`` must match ``SLUG_RE`` and, when ``datasetID`` is set,
       start with the lowercase form of that ID.
    4. Every table must have a non-empty ``dataDictionary``, and every field
       ``enumRef`` must be a key of ``manifest.enumDefinitions``.
    5. ``apply_cross_field_rules`` is invoked last with the same arguments.

    Args:
        manifest: The manifest to inspect.
        result: Collector that receives findings via ``add_finding``.

    Returns:
        None. All output is reported through ``result``.
    """
    # Round-trip through model_dump()/model_validate() so that schema
    # violations surface as findings instead of propagating as an exception.
    try:
        DatasetManifest.model_validate(manifest.model_dump())
    except ValidationError as exc:
        for error in exc.errors():
            result.add_finding(
                Finding(
                    severity="error",
                    code="manifest_contract_error",
                    message=error["msg"],
                    path=".".join(str(item) for item in error["loc"]),
                    rule_id="metadata_contract",
                )
            )

    if manifest.datasetID is None:
        result.add_finding(
            Finding(
                severity="error",
                code="missing_dataset_id",
                message="Manifest must define datasetID.",
                path="datasetID",
                rule_id="dataset_id_required",
            )
        )
    # fullmatch() rather than match(): with match(), the pattern's "$" would
    # also accept an ID that ends in a stray newline.
    elif DATASET_ID_RE.fullmatch(manifest.datasetID) is None:
        result.add_finding(
            Finding(
                severity="error",
                code="invalid_dataset_id",
                message="datasetID must match the pattern AA0000DS0000.",
                path="datasetID",
                rule_id="dataset_id_pattern",
            )
        )

    # Same reasoning as above: require the whole slug to match exactly.
    slug_match = SLUG_RE.fullmatch(manifest.datasetSlug)
    if slug_match is None:
        result.add_finding(
            Finding(
                severity="error",
                code="invalid_dataset_slug",
                message=(
                    "datasetSlug must start with the lowercase dataset ID and use "
                    "lowercase hyphenated words, e.g. cs0007ds0112-example-dataset."
                ),
                path="datasetSlug",
                rule_id="dataset_slug_pattern",
            )
        )
    elif (
        manifest.datasetID is not None
        and slug_match.group("dataset_id") != manifest.datasetID.lower()
    ):
        # The slug is well-formed but its ID prefix belongs to another dataset.
        result.add_finding(
            Finding(
                severity="error",
                code="dataset_slug_id_mismatch",
                message="datasetSlug must begin with the lowercase datasetID followed by '-'.",
                path="datasetSlug",
                rule_id="dataset_slug_matches_id",
            )
        )

    for table_name, table in manifest.datasetTables.items():
        if not table.dataDictionary:
            result.add_finding(
                Finding(
                    severity="error",
                    code="missing_data_dictionary",
                    message="Each table must define a non-empty dataDictionary.",
                    path=f"datasetTables.{table_name}.dataDictionary",
                    table=table_name,
                    rule_id="table_requires_data_dictionary",
                )
            )
            # No fields to inspect. Skipping also avoids calling .items() on
            # a missing (None) dictionary, should the model permit that.
            continue

        for field_name, field in table.dataDictionary.items():
            if field.enumRef and field.enumRef not in manifest.enumDefinitions:
                result.add_finding(
                    Finding(
                        severity="error",
                        code="unknown_enum_reference",
                        message=f"Field references unknown enum definition '{field.enumRef}'.",
                        path=f"datasetTables.{table_name}.dataDictionary.{field_name}.enumRef",
                        table=table_name,
                        field=field_name,
                        rule_id="enum_ref_must_resolve",
                    )
                )

    apply_cross_field_rules(manifest, result)