Skip to content

Day 81 – BI Architecture and Data Modeling

Why it matters

Business intelligence teams translate raw data into governed insights. A clear architecture keeps ingestion, storage, and analytics aligned so analysts can trust their data assets and ship faster.

Agenda

  • Map core data architecture layers, from ingestion pipelines to semantic models.
  • Compare centralized warehouses, flexible lakes, and focused marts for BI delivery.
  • Evaluate modeling trade-offs between star and snowflake patterns.
  • Clarify fact/dimension roles, calculated measures, and semantic model governance.

Star schema example

The build_star_schema_example helper in solutions.py models a retail analytics star schema with a fact_sales table and conformed date, customer, product, and store dimensions. Use the Power BI star schema guidance for a diagrammed walkthrough that mirrors the classroom example.

Snowflake schema example

The build_snowflake_schema_example helper extends the retail model by normalizing the product hierarchy into dedicated tables. Compare the metadata with the Snowflake TPC-DS reference models to see how snowflaked product dimensions appear in SQL DDL.

Classroom resources

  • lesson.py prints the roadmap topics and walks through both schema examples with references.
  • solutions.py exposes grouped topic metadata plus helper functions for converting schema examples into structured dictionaries suitable for tests, slides, or demos.
  • tests/test_day_81.py (added in this task) ensures the helpers surface all expected titles and schema metadata.

Additional Topic: Operational BI Governance

This lesson is part of the Phase 5 Business Intelligence specialization. Use the Phase 5 overview to see how the developer-roadmap topics align across Days 68–84.

Why it matters

Codify the controls that keep BI programs trustworthy.

Developer-roadmap alignment

  • Data Lineage
  • Data Quality
  • Privacy
  • Bias Recognition

Next steps

  • Draft case studies and notebooks that exercise these roadmap nodes.
  • Update the Phase 5 cheat sheet with the insights you capture here.

Previous: Day 80 – BI Data Quality and Governance • Next: Day 82 – BI ETL and Pipeline Automation

You are on lesson 81 of 108.

Additional Materials

lesson.py

View on GitHub

lesson.py
# %%
"""Day 81 – BI Architecture and Data Modeling classroom script."""

# %%
from __future__ import annotations

import pandas as pd

from Day_81_BI_Architecture_and_Data_Modeling import (
    SchemaExample,
    build_snowflake_schema_example,
    build_star_schema_example,
    build_topic_dataframe,
)

# %%
# External reference links surfaced next to each schema walkthrough.
STAR_SCHEMA_RESOURCE = "https://learn.microsoft.com/power-bi/guidance/star-schema"
SNOWFLAKE_SCHEMA_RESOURCE = "https://docs.snowflake.com/en/user-guide/sample-data-tpcds"

# Build the lesson data once at import time so every notebook cell reuses it.
TOPIC_FRAME = build_topic_dataframe()
STAR_SCHEMA = build_star_schema_example()
SNOWFLAKE_SCHEMA = build_snowflake_schema_example()


# %%
def _format_references(table: SchemaTable) -> str:
    """Render a table's foreign-key links as comma-separated ``key → target`` pairs."""

    return ", ".join(
        f"{key} → {target}" for key, target in table.get("references", {}).items()
    )


def schema_to_dataframe(schema: SchemaExample) -> pd.DataFrame:
    """Convert schema metadata to a tabular view for classroom walkthroughs.

    Args:
        schema: Mapping with ``fact_table``, ``dimensions``, and ``commentary``
            entries (see ``SchemaExample`` in ``solutions.py``).

    Returns:
        One row per table with the columns ``table``, ``kind``, ``grain``,
        ``keys``, ``business_fields``, and ``references``.
    """

    fact = schema["fact_table"]
    # The fact row reports measures as its business fields; dimension rows
    # report attributes (falling back to measures if a dimension defines them).
    records: list[dict[str, str]] = [
        {
            "table": fact.get("name", ""),
            "kind": "fact",
            "grain": fact.get("grain", ""),
            "keys": ", ".join(fact.get("keys", [])),
            "business_fields": ", ".join(fact.get("measures", [])),
            # NOTE: the original text had a mojibake separator ("β†’"); the
            # intended glyph is a plain arrow ("→").
            "references": _format_references(fact),
        }
    ]
    for dimension in schema["dimensions"]:
        records.append(
            {
                "table": dimension.get("name", ""),
                "kind": "dimension",
                "grain": dimension.get("grain", ""),
                "keys": ", ".join(dimension.get("keys", [])),
                "business_fields": ", ".join(
                    dimension.get("attributes", dimension.get("measures", []))
                ),
                "references": _format_references(dimension),
            }
        )
    return pd.DataFrame(
        records,
        columns=["table", "kind", "grain", "keys", "business_fields", "references"],
    )


# %%
def summarize_topics(frame: pd.DataFrame) -> None:
    """Print the Day 81 topic taxonomy as a markdown table."""

    banner = "\nDay 81 architecture and modeling roadmap\n"
    print(banner, frame.to_markdown(index=False), sep="\n")


# %%
def review_schema(title: str, schema: SchemaExample, resource_url: str) -> None:
    """Display schema metadata plus a reference link for diagrams or SQL models."""

    sections = (
        f"\n{title}\n",
        schema_to_dataframe(schema).to_markdown(index=False),
        f"Reference: {resource_url}\n",
        schema["commentary"],
    )
    for section in sections:
        print(section)


# %%
def main() -> None:
    """Run the classroom demo for Day 81."""

    summarize_topics(TOPIC_FRAME)
    # Walk through both schema examples with their reference links.
    walkthroughs = (
        ("Retail star schema", STAR_SCHEMA, STAR_SCHEMA_RESOURCE),
        ("Retail snowflake schema", SNOWFLAKE_SCHEMA, SNOWFLAKE_SCHEMA_RESOURCE),
    )
    for heading, schema, link in walkthroughs:
        review_schema(heading, schema, link)


# %%
if __name__ == "__main__":
    main()
solutions.py

View on GitHub

solutions.py
"""Utilities for the Day 81 BI architecture and modeling lesson."""

from __future__ import annotations

from typing import Dict, Iterable, Mapping, Sequence, TypedDict, cast

import pandas as pd

from mypackage.bi_curriculum import BiTopic, topics_by_titles

# Roadmap topic titles grouped into Day 81's two sections; the keys become the
# ``section`` column produced by ``build_topic_dataframe``.
SECTION_TITLES: Mapping[str, list[str]] = {
    "Architectures": [
        "Data Architectures",
        "Cloud BI Ecosystem",
        "Data Warehouse",
        "Data Lake",
        "Data Mart",
    ],
    "Modeling patterns": [
        "Star vs Snowflake Schema",
        "Normalization vs Denormalization",
        "Fact vs Dimension Tables",
        "Calculated Fields & Measures",
        "Data Modeling for BI",
    ],
}

# One-line classroom blurb per topic title; ``build_topic_dataframe`` looks
# titles up here and falls back to an empty string for unknown titles.
TOPIC_DESCRIPTIONS: Mapping[str, str] = {
    "Data Architectures": (
        "Position the overall data platform blueprint that connects sources, storage, and analytics "
        "layers."
    ),
    "Cloud BI Ecosystem": (
        "Highlight managed services and integration patterns that modernize analytics delivery."
    ),
    "Data Warehouse": (
        "Explain curated, structured storage optimized for governed reporting workloads."
    ),
    "Data Lake": (
        "Describe flexible storage that retains raw, semi-structured, and streaming data feeds."
    ),
    "Data Mart": (
        "Discuss subject-area presentation layers tailored to department-specific questions."
    ),
    "Star vs Snowflake Schema": (
        "Compare modeling layouts that balance query simplicity with normalization discipline."
    ),
    "Normalization vs Denormalization": (
        "Assess trade-offs between update efficiency, storage, and analytic performance."
    ),
    "Fact vs Dimension Tables": (
        "Clarify table roles, grains, and relationships when designing BI semantic layers."
    ),
    "Calculated Fields & Measures": (
        "Show how derived metrics provide reusable business logic for dashboards and self-service."
    ),
    "Data Modeling for BI": (
        "Connect modeling choices to governance, scalability, and downstream decision-making."
    ),
}


class SchemaTable(TypedDict, total=False):
    """Structure describing a table in an example schema.

    ``total=False`` makes every field optional so fact tables (measures) and
    dimension tables (attributes) can share this one shape.
    """

    name: str  # physical table name, e.g. "fact_sales"
    grain: str  # what one row represents, e.g. "one row per order line"
    keys: Sequence[str]  # key columns
    measures: Sequence[str]  # numeric business fields (fact tables)
    attributes: Sequence[str]  # descriptive fields (dimension tables)
    references: Mapping[str, str]  # foreign key -> "target_table.column"


class SchemaExample(TypedDict):
    """Container for schema-level metadata used in classroom demos."""

    fact_table: SchemaTable  # the central fact table
    dimensions: Sequence[SchemaTable]  # dimension tables joined from the fact
    commentary: str  # narrative printed alongside the schema tables


def load_topics(
    *, sections: Mapping[str, Iterable[str]] = SECTION_TITLES
) -> Dict[str, list[BiTopic]]:
    """Return roadmap topics grouped by the requested sections.

    Each section name maps to the ``BiTopic`` objects resolved from its title
    list via ``topics_by_titles``.
    """

    return {
        section: topics_by_titles(list(titles))
        for section, titles in sections.items()
    }


def build_topic_dataframe(
    *,
    sections: Mapping[str, Iterable[str]] = SECTION_TITLES,
    descriptions: Mapping[str, str] = TOPIC_DESCRIPTIONS,
) -> pd.DataFrame:
    """Return a DataFrame describing the BI architecture and modeling taxonomy."""

    rows = [
        {
            "section": section,
            "title": topic.title,
            "description": descriptions.get(topic.title, ""),
        }
        for section, topics in load_topics(sections=sections).items()
        for topic in topics
    ]
    frame = pd.DataFrame(rows, columns=["section", "title", "description"])
    if frame.empty:
        return frame
    # A title appearing in several sections is kept only at its first spot.
    deduped = frame.drop_duplicates(subset=["title"], keep="first")
    return deduped.reset_index(drop=True)


def build_star_schema_example() -> SchemaExample:
    """Return metadata for a classroom retail sales star schema."""

    # Conformed dimensions shared across the retail examples.
    date_dim: SchemaTable = {
        "name": "dim_date",
        "grain": "one row per calendar day",
        "keys": ["date_key"],
        "attributes": [
            "date",
            "day_of_week",
            "month",
            "quarter",
            "year",
            "holiday_flag",
        ],
    }
    customer_dim: SchemaTable = {
        "name": "dim_customer",
        "grain": "one row per customer",
        "keys": ["customer_key"],
        "attributes": [
            "customer_name",
            "customer_segment",
            "lifetime_value_band",
            "loyalty_tier",
        ],
    }
    product_dim: SchemaTable = {
        "name": "dim_product",
        "grain": "one row per SKU",
        "keys": ["product_key"],
        "attributes": [
            "product_name",
            "brand",
            "category",
            "subcategory",
        ],
    }
    store_dim: SchemaTable = {
        "name": "dim_store",
        "grain": "one row per physical or digital storefront",
        "keys": ["store_key"],
        "attributes": [
            "store_name",
            "channel",
            "region",
        ],
    }

    # Fact table at order-line grain, joined to every dimension's key.
    sales_fact: SchemaTable = {
        "name": "fact_sales",
        "grain": "one row per order line",
        "keys": ["date_key", "customer_key", "product_key", "store_key"],
        "measures": ["sales_amount", "quantity", "discount_amount", "gross_margin"],
        "references": {
            "date_key": "dim_date.date_key",
            "customer_key": "dim_customer.customer_key",
            "product_key": "dim_product.product_key",
            "store_key": "dim_store.store_key",
        },
    }

    return cast(
        SchemaExample,
        {
            "fact_table": sales_fact,
            "dimensions": [date_dim, customer_dim, product_dim, store_dim],
            "commentary": (
                "Classic star schema anchored on a sales fact table with conformed dimensions to simplify "
                "self-service reporting and aggregate navigation."
            ),
        },
    )


def build_snowflake_schema_example() -> SchemaExample:
    """Return metadata for a snowflake schema extending the retail example."""

    base = build_star_schema_example()

    # Normalized product hierarchy: subcategory table pointing at a
    # category-group table.
    subcategory_dim: SchemaTable = {
        "name": "dim_product_category",
        "grain": "one row per product subcategory",
        "keys": ["subcategory_key"],
        "attributes": ["subcategory", "category_key"],
        "references": {"category_key": "dim_product_category_group.category_key"},
    }
    category_dim: SchemaTable = {
        "name": "dim_product_category_group",
        "grain": "one row per product category",
        "keys": ["category_key"],
        "attributes": ["category", "department"],
    }

    # Re-point dim_product at the new subcategory table; all other star
    # dimensions carry over unchanged.
    snowflaked: list[SchemaTable] = [
        {
            **table,
            "references": {
                "subcategory_key": "dim_product_category.subcategory_key"
            },
        }
        if table["name"] == "dim_product"
        else table
        for table in base["dimensions"]
    ]
    snowflaked.extend([subcategory_dim, category_dim])

    # The fact table still joins to dim_product directly.
    fact_table: SchemaTable = {
        **base["fact_table"],
        "references": {
            **base["fact_table"].get("references", {}),
            "product_key": "dim_product.product_key",
        },
    }

    commentary = (
        "Snowflake schema normalizes the product hierarchy into separate tables to reduce "
        "duplication while preserving downstream joins."
    )

    return cast(
        SchemaExample,
        {
            "fact_table": fact_table,
            "dimensions": snowflaked,
            "commentary": commentary,
        },
    )


# Public API surfaced to lesson scripts and tests.
__all__ = [
    "build_snowflake_schema_example",
    "build_star_schema_example",
    "build_topic_dataframe",
    "load_topics",
    "SchemaExample",
    "SchemaTable",
]