Importing Data in Python Cheat Sheet
Overview

· December 15, 2025 · 7 min read

Most real-world data science starts with importing data. Whether it’s CSV files, Excel spreadsheets, databases, or JSON from APIs, you need to know how to get data into Python efficiently.

The Working Directory

Python resolves relative file paths against the current working directory, so check where that is before you try to load anything.

import os
# Get current working directory
wd = os.getcwd()
print(f"Current directory: {wd}")
# List files in current directory
os.listdir(wd)
# Change working directory
os.chdir("/path/to/new/directory")
# Useful operations
os.rename("old_file.txt", "new_file.txt") # Rename file
os.remove("unwanted_file.txt") # Delete file
os.mkdir("new_folder") # Create folder
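
If your scripts get launched from different folders, relying on the current working directory can silently break relative paths. One common workaround (a sketch; data.csv is a placeholder) is to build paths relative to the script itself:

# __file__ is the path of the running script; anchor data paths to its folder
script_dir = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(script_dir, "data.csv") # Resolves the same file no matter where Python was launched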

Importing with Pandas

Pandas is your Swiss Army knife for data imports. Learn these well.

CSV Files

The most common format in data science.

import pandas as pd
# Basic CSV import
df = pd.read_csv('data.csv')
# With custom parameters
df = pd.read_csv(
    'data.csv',
    nrows=1000, # Read first 1000 rows
    header=0, # Row 0 as column names
    sep=',', # Comma delimiter
    index_col=0, # First column as index
    na_values=['NA', 'N/A', ''], # Recognize as missing
    dtype={'column_name': str} # Specify data types
)
# Common options
df = pd.read_csv(
    'messy_data.csv',
    skiprows=[0, 1], # Skip the first two rows
    usecols=[0, 2, 5], # Select specific columns
    encoding='utf-8', # Handle special characters
    low_memory=False # Infer dtypes from the whole file, not chunk by chunk
)

Excel Files

Handling spreadsheets with multiple sheets.

# Read Excel file
file = 'sales_data.xlsx'
excel_data = pd.ExcelFile(file)
# Get sheet names
print(excel_data.sheet_names)
# ['Sheet1', 'Sheet2', 'Sheet3']
# Read specific sheet
df = pd.read_excel(file, sheet_name=0) # First sheet
df = pd.read_excel(file, sheet_name='Sales') # By name
# Read with custom options
df = pd.read_excel(
    file,
    sheet_name='Sales',
    skiprows=1,
    names=['Date', 'Amount', 'Category'],
    parse_dates=['Date']
)
# Read all sheets into dictionary
all_sheets = pd.read_excel(file, sheet_name=None)
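
Because sheet_name=None returns a dictionary mapping sheet names to DataFrames, you can loop over it or stack all sheets into one frame; a short sketch:

# all_sheets is {sheet_name: DataFrame, ...}
for name, sheet_df in all_sheets.items():
    print(name, sheet_df.shape)
# Stack every sheet into a single DataFrame
combined = pd.concat(all_sheets.values(), ignore_index=True)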

JSON Files

Work with JSON data from APIs and config files.

# From JSON file
df = pd.read_json('data.json')
# From JSON string
import json
json_string = '{"name": "Alice", "age": 25}'
data = json.loads(json_string)
df = pd.DataFrame([data])
# Nested JSON
with open('nested_data.json', 'r') as f:
    data = json.load(f)
df = pd.json_normalize(data) # Flatten nested structure
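
pd.json_normalize can also expand lists of nested records. A minimal sketch with made-up fields (the user/orders structure is an assumption, not a real API response):

# Hypothetical nested payload: one user with a list of orders
payload = {
    "user": {"id": 1, "name": "Alice"},
    "orders": [
        {"order_id": 101, "total": 20.5},
        {"order_id": 102, "total": 13.0},
    ],
}
# record_path expands the list of orders; meta carries fields from the parent object
df = pd.json_normalize(payload, record_path="orders", meta=[["user", "id"], ["user", "name"]])
# Resulting columns: order_id, total, user.id, user.name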

Importing with NumPy

NumPy is faster for numerical data but less flexible than pandas.

Flat Files with NumPy

import numpy as np
# Simple text file
data = np.loadtxt('data.txt')
# With custom delimiter
data = np.loadtxt('comma_separated.txt', delimiter=',')
# Skip rows and select columns
data = np.loadtxt(
    'data.txt',
    delimiter=',',
    skiprows=2, # Skip header rows
    usecols=[0, 2, 4], # Select specific columns
    dtype=float
)
# Files with mixed data types
data = np.genfromtxt(
    'mixed_data.csv',
    delimiter=',',
    names=True, # Use first row as column names
    dtype=None # Auto-detect types
)
# Shortcut for CSV files (wraps genfromtxt; deprecated in recent NumPy releases)
data = np.recfromcsv('data.csv') # Comma delimiter and first-row names by default

Relational Databases

Query databases directly with pandas and SQLAlchemy.

Setup Database Connection

from sqlalchemy import create_engine, inspect
# Create connection
engine = create_engine('sqlite:///my_database.db')
# Another example: a SQLite file such as the Northwind sample database
engine = create_engine('sqlite:///Northwind.sqlite')
# List all tables (engine.table_names() was removed in SQLAlchemy 2.0)
table_names = inspect(engine).get_table_names()
print(f"Available tables: {table_names}")

Querying Databases

import pandas as pd
from sqlalchemy import text
# Method 1: Using pandas directly
df = pd.read_sql_query(
    "SELECT * FROM Orders WHERE Year = 2024",
    engine
)
# Method 2: Using a SQLAlchemy connection (wrap raw SQL in text())
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM Products"))
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()
# Fetch in chunks (for large datasets)
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM LargeTable"))
    df = pd.DataFrame(rs.fetchmany(size=1000)) # Get the first 1000 rows
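
pandas can also stream results for you: passing chunksize to read_sql_query returns an iterator of DataFrames (a sketch; LargeTable is a placeholder):

# Each iteration yields a DataFrame of up to 1000 rows
row_count = 0
for chunk in pd.read_sql_query("SELECT * FROM LargeTable", engine, chunksize=1000):
    row_count += len(chunk) # Replace with your own per-chunk processing
print(f"Processed {row_count} rows")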

Other File Formats

Stata Files (.dta)

# Read Stata files
df = pd.read_stata('survey_data.dta')

SAS Files (.sas7bdat)

from sas7bdat import SAS7BDAT
# Using context manager
with SAS7BDAT('data.sas7bdat') as file:
    df = file.to_data_frame()
# Direct import
df = pd.read_sas('data.sas7bdat')

MATLAB Files (.mat)

import scipy.io
mat = scipy.io.loadmat('workspace.mat')
# Explore structure
print(mat.keys()) # Dictionary keys
print(mat.items()) # Key-value pairs
# Access data
data = mat['variable_name']

HDF5 Files

import h5py
data = h5py.File('data.hdf5', 'r')
# Explore structure
print(data.keys())
# Navigate nested structure
for key in data['meta'].keys():
    print(key)
# Description, DescriptionURL, Detector, etc.
# Read specific data ([()] reads the whole dataset; .value was removed in h5py 3.x)
description = data['meta']['Description'][()]

Pickled Files (.pkl)

import pickle
# Load pickled data
with open('data.pkl', 'rb') as file:
    pickled_data = pickle.load(file)
# Pickled files are Python-specific but preserve exact Python objects
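
Writing a pickle is the mirror image; a small sketch (the dictionary is just an example object):

example = {"model": "v1", "weights": [0.1, 0.2, 0.3]} # Any picklable Python object
with open('data.pkl', 'wb') as file:
    pickle.dump(example, file) # 'wb' because pickle is a binary format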

Using the Context Manager

Always use with statements for file operations—they automatically handle cleanup.

# Reading text files (use one read method per open file; after read()
# the cursor is at the end, so further reads return empty results)
with open('data.txt', 'r') as file:
    text = file.read() # Read entire file
    # first_line = file.readline() # Or: read a single line
    # lines = file.readlines() # Or: read all lines into a list
# File automatically closes when the with block ends
print(f"File closed: {file.closed}") # True
# Writing files
with open('output.txt', 'w') as file:
    file.write("Line 1\n")
    file.write("Line 2\n")

Why use context managers?

  • Automatic cleanup if errors occur (see the sketch after this list)
  • No need to remember file.close()
  • More Pythonic
  • Prevents resource leaks
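
For comparison, this is roughly what the with statement saves you from writing by hand (a sketch of the manual pattern, not how Python implements it):

file = open('data.txt', 'r')
try:
    text = file.read()
finally:
    file.close() # Runs even if read() raises, just like leaving a with block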

Exploring Your Data

Once imported, always explore before analysis.

Pandas DataFrames

# Basic exploration
df.head() # First 5 rows
df.tail() # Last 5 rows
df.head(10) # First 10 rows
# Structure
df.shape # (rows, columns)
df.info() # Data types and missing values
df.describe() # Summary statistics
df.columns # Column names
df.index # Row indices
# Convert to NumPy
data_array = df.to_numpy() # As NumPy array (df.values is the older spelling)

NumPy Arrays

# Array properties
data_array.dtype # Data type of elements
data_array.shape # Dimensions
len(data_array) # Length
# Quick stats
data_array.min() # Minimum value
data_array.max() # Maximum value
data_array.mean() # Mean
data_array.std() # Standard deviation

Shell Commands and Magics (Jupyter Notebooks)

If you’re working in Jupyter, these commands are gold:

# File system commands
!ls # List files
!pwd # Current directory
%cd .. # Change directory (%cd persists; !cd only affects a throwaway subshell)
!mkdir new_folder # Create folder
# File operations
!head data.csv # Preview file
!wc -l data.csv # Count lines
!cat file.txt # Display file contents
# Install packages
!pip install package_name
# System info
!python --version

Real-World Examples

Example 1: Importing Sales Data

import pandas as pd
# Multiple CSV files
import glob
# Find all CSV files
csv_files = glob.glob('sales/*.csv')
# Combine into one DataFrame
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)

Example 2: Reading from Database

from sqlalchemy import create_engine
import pandas as pd
# Connect to database
engine = create_engine('sqlite:///sales.db')
# Query and import
query = """
SELECT
DATE(OrderDate) as date,
SUM(Amount) as daily_sales
FROM Orders
WHERE YEAR(OrderDate) = 2024
GROUP BY DATE(OrderDate)
"""
sales_df = pd.read_sql_query(query, engine)

Example 3: Handling Missing Data During Import

df = pd.read_csv(
    'data.csv',
    na_values=['NA', 'NULL', 'N/A', '', '?', '-999']
)
# Check what was imported as missing
print(df.isnull().sum()) # Count missing per column
print(df.head())
# Fill missing values right after import (read_csv has no fillna parameter)
df = pd.read_csv(
    'data.csv',
    na_values=['NA'],
    keep_default_na=True
).fillna(0) # Replace NaN with 0

Quick Reference

Most Common Import Patterns

# CSV
pd.read_csv('file.csv')
# Excel
pd.read_excel('file.xlsx', sheet_name=0)
# JSON
pd.read_json('file.json')
# Database
pd.read_sql_query('SELECT * FROM table', engine)
# NumPy array
np.loadtxt('file.txt', delimiter=',')
# Text file
with open('file.txt', 'r') as f:
    data = f.read()

Essential Parameters

Most of these work for both read_csv and read_excel (sep and encoding are CSV-only):

sep=',' # Delimiter/separator
header=0 # Row to use as column names
skiprows=[0, 1] # Rows to skip
nrows=1000 # Number of rows to read
usecols=[0, 2, 5] # Column indices to read
index_col=0 # Column to use as index
na_values=['NA'] # Values to treat as missing
encoding='utf-8' # File encoding

Excel-specific:

sheet_name=0 # Sheet number or name
parse_dates=True # Parse date columns (also works in read_csv)

Putting it all together:

df = pd.read_csv(
    'data.csv',
    sep=',', # Comma delimiter
    header=0, # First row has headers
    nrows=10000, # Read 10k rows
    usecols=[0, 1, 3], # Select specific columns
    na_values=['N/A'], # Treat 'N/A' as missing
    encoding='utf-8' # Handle special chars
)

Pro Tips

  • Always specify encoding for international data: encoding='utf-8'
  • Use low_memory=False on large CSVs so pandas infers dtypes from the whole file instead of chunk by chunk
  • Set dtype explicitly when you know the expected types
  • Read very large files in chunks: pd.read_csv(file, chunksize=10000) (see the sketch below)
  • Context managers are your friend; always use with statements
  • Check data types immediately after import with df.dtypes
  • Handle encoding errors gracefully: errors='ignore' in open(), encoding_errors='ignore' in pd.read_csv (pandas 1.3+)
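
A minimal sketch of the chunked-reading tip above (very_large.csv is a placeholder): each chunk is an ordinary DataFrame, so you can aggregate as you go without holding the whole file in memory.

import pandas as pd
# Process a large CSV in 10,000-row chunks instead of loading it all at once
total_rows = 0
for chunk in pd.read_csv('very_large.csv', chunksize=10000):
    total_rows += len(chunk) # Replace with your own per-chunk processing
print(f"Rows processed: {total_rows}")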