usage-guide.RmdVersion: 4.0.0 Purpose: Cross-platform R package for accessing IDEA Public Schools’ SQL Server data warehouse
Before using ideadata, ensure you have:
# Install from GitHub using remotes
install.packages("remotes")
remotes::install_github("idea-analytics/ideadata")
# Or using renv (recommended for project reproducibility)
renv::install("idea-analytics/ideadata")Run the interactive credential setup:
You’ll be prompted for:
firstname.lastname)ODBC Driver 17 for SQL Server
This creates/updates your .Renviron file with:
IDEA_RNA_DB_UID=firstname.lastname
IDEA_RNA_DB_PWD=your_password
IDEA_RNA_ODBC_DRIVER=ODBC Driver 17 for SQL Server
Test your connection with a simple query:
schools <- get_schools() %>% collect()If successful, you’re ready to use the package!
CRITICAL CONCEPT: All get_*() functions
return lazy query objects, not actual data.
# This does NOT pull data into R
students <- get_students()
class(students)
# [1] "tbl_Microsoft SQL Server" "tbl_dbi" "tbl_sql" "tbl_lazy" "tbl"
# The query is only executed when you call collect()
students_data <- get_students() %>% collect()
# 1. Build your query (lazy, instant)
query <- get_students() %>%
filter(grade_level >= 9) %>%
select(student_number, first_name, last_name, schoolid)
# 2. Preview the SQL (optional)
show_query(query)
# 3. Pull the data (this executes on SQL Server)
data <- collect(query)get_table() for Any Table
# If you know the exact table name
my_table <- get_table(.table_name = "Students")
# If table name is ambiguous, you'll get helpful error with options:
# Error: Found 3 tables named 'Students':
# [Copy-paste code for each option shown]
# Specify database to disambiguate
students <- get_table(
.table_name = "Students",
.database_name = "PowerSchool"
)
# Browse all available tables in a viewer
view_warehouse_metadata()
# Search for tables containing a keyword
warehouse_meta_data %>%
filter(str_detect(table_name, "attendance"))
# Students with attendance
student_attendance <- get_students() %>%
left_join(
get_student_daily_attendance(),
by = c("id" = "studentid")
) %>%
filter(att_date >= "2024-01-01") %>%
collect()create_connection(.database_name, .server_name)
Creates a new database connection.
# Usually not needed - connections are created automatically
# But useful for custom database access
con <- create_connection(
.database_name = "PowerSchool",
.server_name = "RGVPDSD-DWPRD1"
)check_get_connection(.database_name, .server_name)
Checks for existing valid connection, creates if needed.
# This is called internally by all get_* functions
# Returns connection object
con <- check_get_connection(.database_name = "PowerSchool")
# All students
students <- get_students() %>% collect()
# Only currently enrolled
current <- get_currently_enrolled_students() %>% collect()
# Academic summary
academic <- get_students_academic_summary() %>% collect()
# Continuous enrollment tracking
enrollment <- get_students_continuous_enrollment() %>% collect()
# All schools
schools <- get_schools() %>% collect()
# Regions
regions <- get_regions() %>% collect()
# Daily attendance
attendance <- get_student_daily_attendance() %>%
filter(att_date >= "2024-01-01") %>%
collect()
# Engagement attendance (remote vs in-person)
engagement <- get_student_engagement_attendance() %>%
collect()
# Math assessments
star_math <- get_renstar_math() %>%
filter(school_year == 2024) %>%
collect()
# Reading assessments
star_reading <- get_renstar_reading() %>%
filter(school_year == 2024) %>%
collect()
# Survey questions
questions <- get_aees_questions() %>% collect()
# Survey results
results <- get_aees_results() %>% collect()
# Open-ended responses
oe_responses <- get_aees_oe_responses() %>% collect()
# All AEES data (convenience wrapper)
all_aees <- get_aees() %>% collect()
# Survey questions
questions <- get_gptw_questions() %>% collect()
# Survey results
results <- get_gptw_results() %>% collect()
# Open-ended responses
oe_responses <- get_gptw_oe_responses() %>% collect()
# Historical persistence data
persistence <- get_persistence_historical() %>%
filter(school_year >= 2020) %>%
collect()
# Persistence codes
codes <- get_persistence_code() %>% collect()
# Persistence reason codes
reasons <- get_persistence_reasons_codes() %>% collect()
# Daily learning location (remote vs in-person)
location <- get_learning_location() %>%
filter(date >= "2024-01-01") %>%
collect()When pulling large datasets, SQL Server connections may timeout or fail.
collector()
The collector() function breaks large queries into
chunks and combines them.
# Instead of this (may fail):
# big_data <- get_student_daily_attendance() %>% collect()
# Use collector to pull in chunks by schoolid
big_data <- get_student_daily_attendance() %>%
collector(schoolid, verbose = TRUE)
# Chunk by both schoolid and yearid
very_big_data <- get_student_daily_attendance() %>%
collector(schoolid, yearid, verbose = TRUE)
# GOOD: Filter before collect()
students_2024 <- get_students() %>%
filter(schoolid == 12345, enroll_status == 0) %>%
collect()
# BAD: Pulls ALL students first, then filters in R
students_2024 <- get_students() %>%
collect() %>%
filter(schoolid == 12345, enroll_status == 0)
# GOOD: Select columns on server
students_subset <- get_students() %>%
select(student_number, first_name, last_name, grade_level) %>%
collect()
# AVOID: Pulling all 50+ columns when you only need 4
students_all <- get_students() %>% collect()
# Build your query
query <- get_students() %>%
filter(grade_level >= 9) %>%
select(student_number, first_name, last_name)
# Check the SQL that will be generated
show_query(query)
# Preview first few rows (fast)
head(query)
# Count rows (fast)
tally(query)
# Pull when satisfied
data <- collect(query)
# GOOD: Join lazy objects before collect()
combined <- get_students() %>%
inner_join(get_schools(), by = "schoolid") %>%
collect()
# LESS EFFICIENT: Pull both tables, then join in R
students <- get_students() %>% collect()
schools <- get_schools() %>% collect()
combined <- inner_join(students, schools, by = "schoolid")
# Connections are automatically reused
# No need to manually manage connections in normal usage
# If you need to close a connection:
disconnect(conn_PowerSchool)
# SQL Server dates work with standard R date filtering
recent_attendance <- get_student_daily_attendance() %>%
filter(att_date >= "2024-01-01", att_date < "2024-07-01") %>%
collect()
# Can also use lubridate functions (may vary in SQL translation)
recent_attendance <- get_student_daily_attendance() %>%
filter(att_date >= lubridate::ymd("2024-01-01")) %>%
collect()Problem: Table name is ambiguous or doesn’t exist.
Solution:
# Search for the table
view_warehouse_metadata()
# Or search programmatically
warehouse_meta_data %>%
filter(str_detect(table_name, "your_table_name"))
# Use full specification
data <- get_table(
.table_name = "exact_table_name",
.database_name = "database_name",
.server_name = "server_name"
)Checklist:
setup_creds() and restarted R?setup_creds()
again)Problem: Trying to pull too much data at once.
Solution: Use collector() to pull in
chunks:
Problem: Pulling too much data at once.
Solutions:
collector() for large datasetscollect()
select()
slice_sample(n = 1000) %>% collect()
library(ideadata)
library(dplyr)
# Get 2023-24 school year students with school info
sy_2024_roster <- get_students() %>%
filter(
entrydate <= "2024-08-31",
exitdate >= "2024-05-31" | is.na(exitdate)
) %>%
left_join(get_schools(), by = "schoolid") %>%
select(
student_number,
first_name,
last_name,
grade_level,
school_name = name,
region
) %>%
collect()
# Current year attendance rates by student
ytd_attendance <- get_student_daily_attendance() %>%
filter(att_date >= "2024-08-01", att_date <= Sys.Date()) %>%
group_by(studentid) %>%
summarize(
days_possible = n(),
days_present = sum(attendancevalue == 1, na.rm = TRUE),
attendance_rate = days_present / days_possible
) %>%
collect() %>%
left_join(
get_students() %>% collect(),
by = c("studentid" = "id")
)
# AEES response rates by region
aees_responses <- get_aees_results() %>%
collect()
# Calculate response rates
library(tidyr)
response_summary <- aees_responses %>%
group_by(region) %>%
summarize(
total_invited = n_distinct(employee_id),
total_responded = n_distinct(employee_id[!is.na(response)]),
response_rate = total_responded / total_invited
)
# Five years of persistence data
# This is large - use collector!
persistence_5yr <- get_persistence_historical() %>%
filter(school_year >= 2019) %>%
collector(school_year, verbose = TRUE)browseVignettes("ideadata") for detailed guides?ideadata or
help(package = "ideadata")
view_warehouse_metadata() to explore available tables
# Setup (first time only)
setup_creds() # Configure credentials
.rs.restartR() # Restart R session
# Loading
library(ideadata) # Load package
# Exploring
view_warehouse_metadata() # Browse available tables
warehouse_meta_data # View metadata tibble
# Getting Data
get_table("table_name") # Generic table access
get_students() # Student data
get_schools() # School data
get_student_daily_attendance() # Attendance data
# Working with Queries
query %>% show_query() # Preview SQL
query %>% head() # Preview first rows
query %>% tally() # Count rows
query %>% collect() # Pull data into R
# Large Datasets
query %>% collector(grouping_var) # Chunked data pull
# Troubleshooting
setup_creds() # Reset credentials
detach("package:ideadata", unload = TRUE)
library(ideadata) # Renew KerberosPackage Version: 4.0.0 Maintained by: IDEA Analytics Team