Jupytext Example¶
This notebook demonstrates Jupytext functionality: the ability to work with notebooks in both .ipynb and .py formats.
Benefits of Jupytext¶
- Version control friendly: Python scripts are easier to diff and merge
- Text editor support: Edit notebooks in your favorite text editor
- Automatic synchronization: Changes in either format sync automatically
- Reduced merge conflicts: Text-based format reduces Git conflicts
Table of Contents¶
1. Basic Python Code
2. Data Visualization
3. Interactive Elements
1. Basic Python Code¶
Let's start with some basic Python examples to show how code cells work in Jupytext format.
In [ ]:
# Import required libraries
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print("Libraries imported successfully!")
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Current time: {datetime.now()}")
In [ ]:
# Create some sample data
np.random.seed(42) # For reproducible results
# Generate sample data
data = {
    "x": np.linspace(0, 10, 100),
    "y": np.sin(np.linspace(0, 10, 100)) + np.random.normal(0, 0.1, 100),
    "category": np.random.choice(["A", "B", "C"], 100),
}
df = pd.DataFrame(data)
print("Sample data created:")
print(df.head())
print(f"Data shape: {df.shape}")
2. Data Visualization¶
Now let's create some visualizations to demonstrate plotting capabilities.
In [ ]:
# Create a simple plot
plt.figure(figsize=(12, 4))
# Subplot 1: Line plot
plt.subplot(1, 2, 1)
plt.plot(df["x"], df["y"], "b-", alpha=0.7, label="Noisy sine wave")
plt.plot(df["x"], np.sin(df["x"]), "r--", label="Pure sine wave")
plt.title("Line Plot Example")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.legend()
plt.grid(True, alpha=0.3)
# Subplot 2: Scatter plot by category
plt.subplot(1, 2, 2)
for cat in df["category"].unique():
    mask = df["category"] == cat
    plt.scatter(df[mask]["x"], df[mask]["y"], label=f"Category {cat}", alpha=0.7)
plt.title("Scatter Plot by Category")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [ ]:
# Summary statistics by category
print("Summary statistics by category:")
summary = (
    df.groupby("category")
    .agg({"x": ["mean", "std"], "y": ["mean", "std", "count"]})
    .round(3)
)
print(summary)
3. Interactive Elements¶
Jupytext works well with interactive elements and widgets.
In [ ]:
# Create an interactive data exploration function
def explore_data(n_points=100, noise_level=0.1, frequency=1.0):
    """
    Generate and plot sample data with configurable parameters

    Parameters:
    - n_points: Number of data points to generate
    - noise_level: Amount of noise to add (0.0 to 1.0)
    - frequency: Frequency of the sine wave
    """
    x = np.linspace(0, 4 * np.pi, n_points)
    y_clean = np.sin(frequency * x)
    y_noisy = y_clean + np.random.normal(0, noise_level, n_points)

    plt.figure(figsize=(10, 6))
    plt.plot(x, y_clean, "r--", linewidth=2, label="Clean signal")
    plt.plot(x, y_noisy, "b-", alpha=0.7, label="Noisy signal")
    plt.title(
        f"Interactive Data Exploration (n={n_points}, noise={noise_level}, freq={frequency})"
    )
    plt.xlabel("X values")
    plt.ylabel("Y values")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Print some statistics
    correlation = np.corrcoef(y_clean, y_noisy)[0, 1]
    print(f"Correlation between clean and noisy signals: {correlation:.3f}")
    print(f"Signal-to-noise ratio: {np.std(y_clean) / np.std(y_noisy - y_clean):.3f}")


# Test with default parameters
explore_data()
In [ ]:
# Test with different parameters
print("\\n" + "=" * 50)
print("Testing with different parameters:")
print("=" * 50)
explore_data(n_points=50, noise_level=0.3, frequency=2.0)
Jupytext Features Demonstrated¶
This notebook shows several key Jupytext features:
Format Flexibility¶
- This file exists as both .py and .ipynb formats
- Edit in either format and changes sync automatically (a minimal pairing sketch follows this list)
- Version control works better with the .py format
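One way to see this flexibility in action is Jupytext's Python API. The snippet below is a minimal sketch, assuming Jupytext is installed; the file names are illustrative placeholders, not files in this project:

import jupytext

# Read the script representation and write the same content out as a notebook.
notebook = jupytext.read("example.py")  # also accepts .ipynb or .md inputs
jupytext.write(notebook, "example.ipynb")

# The reverse direction works the same way, so either file can serve as the source.
roundtrip = jupytext.read("example.ipynb")
jupytext.write(roundtrip, "example.py", fmt="py:percent")

In day-to-day use you would normally rely on paired notebooks inside Jupyter (or the jupytext command-line tool) rather than calling the API by hand; the API is shown here only to make the round trip explicit.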
Cell Types¶
- Code cells: Regular Python code (marked with # %%; see the sketch after this list)
- Markdown cells: Documentation (marked with # %% [markdown])
- Raw cells: Unprocessed content (if needed)
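To make the markers concrete, here is a short, hedged sketch of how a couple of cells look in the py:percent text representation (the cell contents are illustrative):

# %% [markdown]
# ## Data Visualization
# Markdown cells become commented blocks under a `# %% [markdown]` marker.

# %%
# Code cells sit under a plain `# %%` marker and remain ordinary Python.
import numpy as np

x = np.linspace(0, 10, 100)
print(x.mean())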
Metadata Preservation¶
- Notebook metadata is preserved in the YAML header at the top of the .py file (an illustrative header follows this list)
- Cell execution order is maintained; outputs live in the paired .ipynb file, since the text representation stores only cell sources
- Compatible with the standard Jupyter workflow
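For reference, the commented YAML header of a paired py:percent file typically looks like the block below; the exact fields and values vary with your Jupytext version and kernel, so treat this as an illustrative sketch:

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---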
Best Practices¶
- Use descriptive cell comments for better readability
- Keep functions and classes in separate cells for modularity
- Add markdown documentation to explain complex logic
- Use consistent formatting for easier collaboration
This example demonstrates the power of Jupytext for maintaining notebooks in a version-control friendly format while preserving full Jupyter functionality.
In [ ]:
# Final example: Create a simple data processing pipeline
def data_pipeline():
    """A simple data processing pipeline example"""
    # Step 1: Generate synthetic data
    np.random.seed(123)
    dates = pd.date_range("2023-01-01", periods=365, freq="D")
    values = np.cumsum(np.random.randn(365)) + 100
    trend = np.linspace(0, 50, 365)
    noisy_values = values + trend + np.random.normal(0, 5, 365)

    # Step 2: Create DataFrame
    pipeline_df = pd.DataFrame(
        {"date": dates, "value": noisy_values, "trend": trend, "baseline": values}
    )

    # Step 3: Add computed columns
    pipeline_df["moving_avg"] = pipeline_df["value"].rolling(window=30).mean()
    pipeline_df["month"] = pipeline_df["date"].dt.month
    pipeline_df["quarter"] = pipeline_df["date"].dt.quarter

    # Step 4: Create visualization
    plt.figure(figsize=(15, 8))

    plt.subplot(2, 1, 1)
    plt.plot(
        pipeline_df["date"], pipeline_df["value"], "b-", alpha=0.5, label="Raw values"
    )
    plt.plot(
        pipeline_df["date"],
        pipeline_df["moving_avg"],
        "r-",
        linewidth=2,
        label="30-day moving average",
    )
    plt.plot(
        pipeline_df["date"], pipeline_df["trend"] + 100, "g--", label="Underlying trend"
    )
    plt.title("Time Series Data Processing Pipeline")
    plt.ylabel("Values")
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 1, 2)
    quarterly_avg = pipeline_df.groupby("quarter")["value"].mean()
    plt.bar(
        quarterly_avg.index,
        quarterly_avg.values,
        alpha=0.7,
        color=["skyblue", "lightcoral", "lightgreen", "gold"],
    )
    plt.title("Quarterly Averages")
    plt.xlabel("Quarter")
    plt.ylabel("Average Value")
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Step 5: Return summary statistics
    summary_stats = {
        "total_records": len(pipeline_df),
        "date_range": f"{pipeline_df['date'].min()} to {pipeline_df['date'].max()}",
        "value_stats": {
            "mean": pipeline_df["value"].mean(),
            "std": pipeline_df["value"].std(),
            "min": pipeline_df["value"].min(),
            "max": pipeline_df["value"].max(),
        },
        "quarterly_averages": quarterly_avg.to_dict(),
    }
    return pipeline_df, summary_stats


# Run the pipeline
result_df, stats = data_pipeline()
print("Pipeline Summary:")
print(f"Records processed: {stats['total_records']}")
print(f"Date range: {stats['date_range']}")
print(f"Value statistics: {stats['value_stats']}")
print(f"Quarterly averages: {stats['quarterly_averages']}")