Jupytext Example¶
This notebook demonstrates Jupytext functionality: the ability to work with notebooks in both .ipynb and .py formats.
Benefits of Jupytext¶
- Version control friendly: Python scripts are easier to diff and merge
- Text editor support: Edit notebooks in your favorite text editor
- Automatic synchronization: Changes in either format sync automatically
- Reduced merge conflicts: Text-based format reduces Git conflicts
Table of Contents¶
1. Basic Python Code
2. Data Visualization
3. Interactive Elements
1. Basic Python Code¶
Let's start with some basic Python examples to show how code cells work in Jupytext format.
In [ ]:
# Import required libraries
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
print("Libraries imported successfully!")
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Current time: {datetime.now()}")
In [ ]:
# Create some sample data
np.random.seed(42) # For reproducible results
# Generate sample data
data = {
    "x": np.linspace(0, 10, 100),
    "y": np.sin(np.linspace(0, 10, 100)) + np.random.normal(0, 0.1, 100),
    "category": np.random.choice(["A", "B", "C"], 100),
}
df = pd.DataFrame(data)
print("Sample data created:")
print(df.head())
print(f"Data shape: {df.shape}")
2. Data Visualization¶
Now let's create some visualizations to demonstrate plotting capabilities.
In [ ]:
# Create a simple plot
plt.figure(figsize=(12, 4))
# Subplot 1: Line plot
plt.subplot(1, 2, 1)
plt.plot(df["x"], df["y"], "b-", alpha=0.7, label="Noisy sine wave")
plt.plot(df["x"], np.sin(df["x"]), "r--", label="Pure sine wave")
plt.title("Line Plot Example")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.legend()
plt.grid(True, alpha=0.3)
# Subplot 2: Scatter plot by category
plt.subplot(1, 2, 2)
for cat in df["category"].unique():
    mask = df["category"] == cat
    plt.scatter(df[mask]["x"], df[mask]["y"], label=f"Category {cat}", alpha=0.7)
plt.title("Scatter Plot by Category")
plt.xlabel("X values")
plt.ylabel("Y values")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [ ]:
# Summary statistics by category
print("Summary statistics by category:")
summary = (
    df.groupby("category")
    .agg({"x": ["mean", "std"], "y": ["mean", "std", "count"]})
    .round(3)
)
print(summary)
3. Interactive Elements¶
Jupytext works well with interactive elements and widgets.
In [ ]:
# Create an interactive data exploration function
def explore_data(n_points=100, noise_level=0.1, frequency=1.0):
    """
    Generate and plot sample data with configurable parameters

    Parameters:
    - n_points: Number of data points to generate
    - noise_level: Amount of noise to add (0.0 to 1.0)
    - frequency: Frequency of the sine wave
    """
    x = np.linspace(0, 4 * np.pi, n_points)
    y_clean = np.sin(frequency * x)
    y_noisy = y_clean + np.random.normal(0, noise_level, n_points)

    plt.figure(figsize=(10, 6))
    plt.plot(x, y_clean, "r--", linewidth=2, label="Clean signal")
    plt.plot(x, y_noisy, "b-", alpha=0.7, label="Noisy signal")
    plt.title(
        f"Interactive Data Exploration (n={n_points}, noise={noise_level}, freq={frequency})"
    )
    plt.xlabel("X values")
    plt.ylabel("Y values")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Print some statistics
    correlation = np.corrcoef(y_clean, y_noisy)[0, 1]
    print(f"Correlation between clean and noisy signals: {correlation:.3f}")
    print(f"Signal-to-noise ratio: {np.std(y_clean) / np.std(y_noisy - y_clean):.3f}")


# Test with default parameters
explore_data()
In [ ]:
# Test with different parameters
print("\\n" + "=" * 50)
print("Testing with different parameters:")
print("=" * 50)
explore_data(n_points=50, noise_level=0.3, frequency=2.0)
Jupytext Features Demonstrated¶
This notebook shows several key Jupytext features:
Format Flexibility¶
- This file exists as both .py and .ipynb formats
- Edit in either format and changes sync automatically (a minimal pairing sketch follows this list)
- Version control works better with the .py format
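One way to see this flexibility in action is Jupytext's Python API. The snippet below is a minimal sketch, assuming Jupytext is installed; the file names are illustrative placeholders, not files in this project:

import jupytext

# Read the script representation and write the same content out as a notebook.
notebook = jupytext.read("example.py")  # also accepts .ipynb or .md inputs
jupytext.write(notebook, "example.ipynb")

# The reverse direction works the same way, so either file can serve as the source.
roundtrip = jupytext.read("example.ipynb")
jupytext.write(roundtrip, "example.py", fmt="py:percent")

In day-to-day use you would normally rely on paired notebooks inside Jupyter (or the jupytext command-line tool) rather than calling the API by hand; the API is shown here only to make the round trip explicit.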
Cell Types¶
- Code cells: Regular Python code (marked with # %%; see the sketch after this list)
- Markdown cells: Documentation (marked with # %% [markdown])
- Raw cells: Unprocessed content (if needed)
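To make the markers concrete, here is a short, hedged sketch of how a couple of cells look in the py:percent text representation (the cell contents are illustrative):

# %% [markdown]
# ## Data Visualization
# Markdown cells become commented blocks under a `# %% [markdown]` marker.

# %%
# Code cells sit under a plain `# %%` marker and remain ordinary Python.
import numpy as np

x = np.linspace(0, 10, 100)
print(x.mean())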
Metadata Preservation¶
- Notebook metadata is preserved in the YAML header at the top of the .py file (an illustrative header follows this list)
- Cell execution order is maintained; outputs live in the paired .ipynb file, since the text representation stores only cell sources
- Compatible with the standard Jupyter workflow
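For reference, the commented YAML header of a paired py:percent file typically looks like the block below; the exact fields and values vary with your Jupytext version and kernel, so treat this as an illustrative sketch:

# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---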
Best Practices¶
- Use descriptive cell comments for better readability
- Keep functions and classes in separate cells for modularity
- Add markdown documentation to explain complex logic
- Use consistent formatting for easier collaboration
This example demonstrates the power of Jupytext for maintaining notebooks in a version-control friendly format while preserving full Jupyter functionality.
In [ ]:
# Final example: Create a simple data processing pipeline
def data_pipeline():
    """A simple data processing pipeline example"""
    # Step 1: Generate synthetic data
    np.random.seed(123)
    dates = pd.date_range("2023-01-01", periods=365, freq="D")
    values = np.cumsum(np.random.randn(365)) + 100
    trend = np.linspace(0, 50, 365)
    noisy_values = values + trend + np.random.normal(0, 5, 365)

    # Step 2: Create DataFrame
    pipeline_df = pd.DataFrame(
        {"date": dates, "value": noisy_values, "trend": trend, "baseline": values}
    )

    # Step 3: Add computed columns
    pipeline_df["moving_avg"] = pipeline_df["value"].rolling(window=30).mean()
    pipeline_df["month"] = pipeline_df["date"].dt.month
    pipeline_df["quarter"] = pipeline_df["date"].dt.quarter

    # Step 4: Create visualization
    plt.figure(figsize=(15, 8))

    plt.subplot(2, 1, 1)
    plt.plot(
        pipeline_df["date"], pipeline_df["value"], "b-", alpha=0.5, label="Raw values"
    )
    plt.plot(
        pipeline_df["date"],
        pipeline_df["moving_avg"],
        "r-",
        linewidth=2,
        label="30-day moving average",
    )
    plt.plot(
        pipeline_df["date"], pipeline_df["trend"] + 100, "g--", label="Underlying trend"
    )
    plt.title("Time Series Data Processing Pipeline")
    plt.ylabel("Values")
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 1, 2)
    quarterly_avg = pipeline_df.groupby("quarter")["value"].mean()
    plt.bar(
        quarterly_avg.index,
        quarterly_avg.values,
        alpha=0.7,
        color=["skyblue", "lightcoral", "lightgreen", "gold"],
    )
    plt.title("Quarterly Averages")
    plt.xlabel("Quarter")
    plt.ylabel("Average Value")
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Step 5: Return summary statistics
    summary_stats = {
        "total_records": len(pipeline_df),
        "date_range": f"{pipeline_df['date'].min()} to {pipeline_df['date'].max()}",
        "value_stats": {
            "mean": pipeline_df["value"].mean(),
            "std": pipeline_df["value"].std(),
            "min": pipeline_df["value"].min(),
            "max": pipeline_df["value"].max(),
        },
        "quarterly_averages": quarterly_avg.to_dict(),
    }
    return pipeline_df, summary_stats


# Run the pipeline
result_df, stats = data_pipeline()
print("Pipeline Summary:")
print(f"Records processed: {stats['total_records']}")
print(f"Date range: {stats['date_range']}")
print(f"Value statistics: {stats['value_stats']}")
print(f"Quarterly averages: {stats['quarterly_averages']}")