Usage Guide
1. Installation
The package can be installed directly from PyPI using pip:
pip install stata_codebook
2. Quick Start
Here's a quick example to get you started:
import pandas as pd
from stata_codebook import codebook
# Sample DataFrame
data = {
'age': [25, 30, 35, 40, None],
'income': [50000, 60000, 70000, 80000, 90000],
'gender': ['Male', 'Female', 'Female', 'Male', None],
'is_employed': [True, True, False, True, None]
}
df = pd.DataFrame(data)
# codebook for all dataset variables
codebook(df)
|
Variable |
Type |
Unique values |
Missing values |
Blank issues |
Range |
25th percentile |
50th percentile (Median) |
75th percentile |
Mean |
Examples |
Top categories |
SD |
95% CI |
Normality test |
p-value (normality) |
Top category proportion |
95% CI (top category) |
0 |
age |
float64 |
4 |
1 |
Not applicable |
(25.0, 40.0) |
28.75 |
32.5 |
36.25 |
32.5 |
[35.0, 25.0, 30.0] |
- |
- |
- |
- |
- |
NaN |
NaN |
1 |
income |
int64 |
5 |
0 |
Not applicable |
(50000, 90000) |
60000.0 |
70000.0 |
80000.0 |
70000.0 |
[70000, 50000, 60000] |
- |
- |
- |
- |
- |
NaN |
NaN |
2 |
gender |
object |
2 |
1 |
No blanks detected |
- |
- |
- |
- |
- |
[Female, Male, Female] |
{'Male': 2, 'Female': 2} |
- |
NaN |
- |
- |
- |
- |
3 |
is_employed |
object |
2 |
1 |
No blanks detected |
- |
- |
- |
- |
- |
[False, True, True] |
{True: 3, False: 1} |
- |
NaN |
- |
- |
- |
- |
# codebook for a specific column in the dataset
codebook(df, column='income') # numerical column
|
Variable |
Type |
Unique values |
Missing values |
Blank issues |
Range |
25th percentile |
50th percentile (Median) |
75th percentile |
Mean |
Examples |
Top categories |
SD |
95% CI |
Normality test |
p-value (normality) |
0 |
income |
int64 |
5 |
0 |
Not applicable |
(50000, 90000) |
60000.0 |
70000.0 |
80000.0 |
70000.0 |
[70000, 50000, 60000] |
- |
- |
- |
- |
- |
# codebook for a specific column in the dataset
codebook(df, column='gender') # categorical column
|
Variable |
Type |
Unique values |
Missing values |
Blank issues |
Examples |
Top categories |
Range |
25th percentile |
50th percentile (Median) |
75th percentile |
Mean |
SD |
Normality test |
p-value (normality) |
Top category proportion |
95% CI (top category) |
0 |
gender |
object |
2 |
1 |
No blanks detected |
[Female, Male, Female] |
{'Male': 2, 'Female': 2} |
- |
- |
- |
- |
- |
- |
- |
- |
- |
- |
# codebook for all dataset variables with additional (advanced) statistics
codebook(df, advanced=True)
|
Variable |
Type |
Unique values |
Missing values |
Blank issues |
Range |
25th percentile |
50th percentile (Median) |
75th percentile |
Mean |
Examples |
Top categories |
SD |
95% CI |
Normality test |
p-value (normality) |
Top category proportion |
95% CI (top category) |
0 |
age |
float64 |
4 |
1 |
Not applicable |
(25.0, 40.0) |
28.75 |
32.5 |
36.25 |
32.5 |
[35.0, 25.0, 30.0] |
- |
6.455 |
(26.174, 38.826) |
Shapiro-Wilk |
0.972 |
NaN |
NaN |
1 |
income |
int64 |
5 |
0 |
Not applicable |
(50000, 90000) |
60000.0 |
70000.0 |
80000.0 |
70000.0 |
[70000, 50000, 60000] |
- |
15811.388 |
(56140.707, 83859.293) |
Shapiro-Wilk |
0.967 |
NaN |
NaN |
2 |
gender |
object |
2 |
1 |
No blanks detected |
- |
- |
- |
- |
- |
[Female, Male, Female] |
{'Male': 2, 'Female': 2} |
- |
NaN |
- |
- |
0.50 |
(0.01, 0.99) |
3 |
is_employed |
object |
2 |
1 |
No blanks detected |
- |
- |
- |
- |
- |
[False, True, True] |
{True: 3, False: 1} |
- |
NaN |
- |
- |
0.75 |
(0.326, 1.174) |