Most real-world data science starts with importing data. Whether it’s CSV files, Excel spreadsheets, databases, or JSON from APIs, you need to know how to get data into Python efficiently.
The Working Directory
Python resolves relative file paths against the current working directory, so check and set it before you import anything.
import os
# Get current working directory
wd = os.getcwd()
print(f"Current directory: {wd}")
# List files in current directory
os.listdir(wd)
# Change working directory
os.chdir("/path/to/new/directory")
# Useful operations
os.rename("old_file.txt", "new_file.txt")  # Rename file
os.remove("unwanted_file.txt")             # Delete file
os.mkdir("new_folder")                     # Create folder
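Before importing anything, it also helps to confirm the file is actually where you think it is. A minimal sketch using the same os module; the folder and file name are hypothetical:

import os

# Build a path that works on any operating system
data_path = os.path.join("data", "sales_2024.csv")  # hypothetical location

# Check that the file exists before trying to read it
if os.path.exists(data_path):
    print(f"Found: {data_path}")
else:
    print(f"Missing: {data_path} - check your working directory")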
Importing with Pandas
Pandas is your Swiss Army knife for data imports. Learn these well.
CSV Files
The most common format in data science.
import pandas as pd
# Basic CSV import
df = pd.read_csv('data.csv')
# With custom parameters
df = pd.read_csv(
    'data.csv',
    nrows=1000,                   # Read first 1000 rows
    header=0,                     # Row 0 as column names
    sep=',',                      # Comma delimiter
    index_col=0,                  # First column as index
    na_values=['NA', 'N/A', ''],  # Recognize as missing
    dtype={'column_name': str}    # Specify data types
)
# Common options
df = pd.read_csv(
    'messy_data.csv',
    skiprows=[0, 1],    # Skip header rows
    usecols=[0, 2, 5],  # Select specific columns
    encoding='utf-8',   # Handle special characters
    low_memory=False    # Avoid mixed-dtype warnings on large files
)
Excel Files
Handling spreadsheets with multiple sheets.
# Read Excel file
file = 'sales_data.xlsx'
excel_data = pd.ExcelFile(file)
# Get sheet names
print(excel_data.sheet_names)
# ['Sheet1', 'Sheet2', 'Sheet3']
# Read specific sheet
df = pd.read_excel(file, sheet_name=0)        # First sheet
df = pd.read_excel(file, sheet_name='Sales')  # By name
# Read with custom options
df = pd.read_excel(
    file,
    sheet_name='Sales',
    skiprows=1,
    names=['Date', 'Amount', 'Category'],
    parse_dates=['Date']
)
# Read all sheets into dictionary
all_sheets = pd.read_excel(file, sheet_name=None)
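sheet_name=None returns a dictionary mapping sheet names to DataFrames. A minimal sketch of stacking them into a single DataFrame, assuming the sheets share the same columns:

# all_sheets is a dict: {sheet_name: DataFrame}
frames = []
for name, sheet_df in all_sheets.items():
    sheet_df['source_sheet'] = name  # remember which sheet each row came from
    frames.append(sheet_df)

combined = pd.concat(frames, ignore_index=True)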
JSON Files
Work with JSON data from APIs and config files.
# From JSON file
df = pd.read_json('data.json')
# From JSON string
import json
json_string = '{"name": "Alice", "age": 25}'
data = json.loads(json_string)
df = pd.DataFrame([data])
# Nested JSON
with open('nested_data.json', 'r') as f:
    data = json.load(f)

df = pd.json_normalize(data)  # Flatten nested structure
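JSON from a web API follows the same pattern: fetch the response, then flatten it. A minimal sketch using the requests library; the URL and the shape of the response are hypothetical:

import requests
import pandas as pd

# Hypothetical endpoint that returns a list of JSON records
response = requests.get('https://api.example.com/orders')
response.raise_for_status()      # Fail loudly on HTTP errors

records = response.json()        # Parse the JSON body
df = pd.json_normalize(records)  # Flatten nested fields into columns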
Importing with NumPy
NumPy is faster for numerical data but less flexible than pandas.
Flat Files with NumPy
import numpy as np
# Simple text file
data = np.loadtxt('data.txt')
# With custom delimiter
data = np.loadtxt('comma_separated.txt', delimiter=',')
# Skip rows and select columns
data = np.loadtxt(
    'data.txt',
    delimiter=',',
    skiprows=2,         # Skip header rows
    usecols=[0, 2, 4],  # Select specific columns
    dtype=float
)
# Files with mixed data types
data = np.genfromtxt(
    'mixed_data.csv',
    delimiter=',',
    names=True,        # Use first row as column names
    dtype=None,        # Auto-detect types
    encoding='utf-8'   # Decode text columns as str rather than bytes
)
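With names=True, genfromtxt returns a structured array, so columns are accessed by name rather than by position. A short sketch; the column name 'price' is hypothetical:

# Column names detected from the header row
print(data.dtype.names)

# Pull out one column by name and compute a quick statistic
prices = data['price']
print(prices.mean())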
# Even simpler for CSV
data = np.recfromcsv('data.csv')  # CSV-friendly defaults: comma delimiter, first row as names
# Note: recfromcsv is deprecated/removed in recent NumPy releases; prefer np.genfromtxt as above
Relational Databases
Query databases directly with pandas and SQLAlchemy.
Setting Up a Database Connection
from sqlalchemy import create_engine
# Create connection
engine = create_engine('sqlite:///my_database.db')
# Another SQLite file (note the three slashes before the file name)
engine = create_engine('sqlite:///Northwind.sqlite')
# List all tables (engine.table_names() was removed in SQLAlchemy 2.0)
from sqlalchemy import inspect

table_names = inspect(engine).get_table_names()
print(f"Available tables: {table_names}")
Querying Databases
import pandas as pd
# Method 1: Using pandas directly
df = pd.read_sql_query(
    "SELECT * FROM Orders WHERE Year = 2024",
    engine
)
# Method 2: Using a SQLAlchemy connection
from sqlalchemy import text  # raw SQL strings must be wrapped in text() in SQLAlchemy 2.0

with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM Products"))
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()
# Fetch in chunks (for large result sets)
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM LargeTable"))
    df = pd.DataFrame(rs.fetchmany(size=1000))  # Get the first 1000 rows
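A single fetchmany call only returns the first chunk. To walk through an entire large table, keep calling it until nothing comes back; a minimal sketch reusing the engine from above (the table name is hypothetical):

import pandas as pd
from sqlalchemy import text

chunks = []
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM LargeTable"))
    columns = list(rs.keys())
    while True:
        rows = rs.fetchmany(size=1000)  # Next 1000 rows, or [] when exhausted
        if not rows:
            break
        # Process or aggregate each chunk here instead of keeping them all
        chunks.append(pd.DataFrame(rows, columns=columns))

df = pd.concat(chunks, ignore_index=True)

pd.read_sql_query also accepts a chunksize argument that yields DataFrames of that size, which does the same job with less code.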
Other File Formats
Stata Files (.dta)
# Read Stata files
df = pd.read_stata('survey_data.dta')
SAS Files (.sas7bdat)
from sas7bdat import SAS7BDAT
# Using context manager
with SAS7BDAT('data.sas7bdat') as file:
    df = file.to_data_frame()
# Direct import
df = pd.read_sas('data.sas7bdat')
MATLAB Files (.mat)
import scipy.io
mat = scipy.io.loadmat('workspace.mat')
# Explore structure
print(mat.keys())   # Dictionary keys
print(mat.items())  # Key-value pairs
# Access data
data = mat['variable_name']
HDF5 Files
import h5py
data = h5py.File('data.hdf5', 'r')
# Explore structure
print(data.keys())
# Navigate nested structure
for key in data['meta'].keys():
    print(key)
# Description, DescriptionURL, Detector, etc.
# Read specific data
description = data['meta']['Description'][()]  # .value was removed from h5py; use [()]
Pickled Files (.pkl)
import pickle
# Load pickled data
with open('data.pkl', 'rb') as file:
    pickled_data = pickle.load(file)
# Pickled files are Python-specific but preserve exact Python objects
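For completeness, a minimal sketch of writing a pickle as well, showing that an arbitrary Python object survives the round trip (the object here is made up). Only unpickle files you trust, since loading a pickle can execute arbitrary code.

import pickle

results = {'model': 'baseline', 'scores': [0.81, 0.85, 0.79]}  # hypothetical object

# Write: 'wb' because pickle is a binary format
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)

# Read it back with identical structure and types
with open('results.pkl', 'rb') as f:
    restored = pickle.load(f)

print(restored == results)  # True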
Using the Context Manager
Always use with statements for file operations; they handle cleanup automatically.
# Reading text files
with open('data.txt', 'r') as file:
    text = file.read()            # Read entire file
    # Note: read() consumes the file; in practice pick one of these per open()
    first_line = file.readline()  # Read one line
    lines = file.readlines()      # Read remaining lines into a list
# File automatically closes when done
print(f"File closed: {file.closed}")  # True
# Writing files
with open('output.txt', 'w') as file:
    file.write("Line 1\n")
    file.write("Line 2\n")
Why use context managers?
- Automatic cleanup if errors occur
- No need to remember file.close()
- More Pythonic
- Prevents resource leaks
Exploring Your Data
Once imported, always explore before analysis.
Pandas DataFrames
# Basic exploration
df.head()    # First 5 rows
df.tail()    # Last 5 rows
df.head(10)  # First 10 rows
# Structure
df.shape       # (rows, columns)
df.info()      # Data types and missing values
df.describe()  # Summary statistics
df.columns     # Column names
df.index       # Row indices
# Convert to NumPy
data_array = df.values  # As NumPy array (df.to_numpy() is the modern equivalent)
NumPy Arrays
# Array properties
data_array.dtype  # Data type of elements
data_array.shape  # Dimensions
len(data_array)   # Length
# Quick stats
data_array.min()   # Minimum value
data_array.max()   # Maximum value
data_array.mean()  # Mean
data_array.std()   # Standard deviation
Magic Commands (Jupyter Notebooks)
If you’re working in Jupyter, these commands are gold:
# File system commands
!ls                # List files
!pwd               # Current directory
%cd ..             # Change directory (%cd persists; !cd runs in a throwaway subshell)
!mkdir new_folder  # Create folder
# File operations
!head data.csv   # Preview file
!wc -l data.csv  # Count lines
!cat file.txt    # Display file contents
# Install packages
!pip install package_name
# System info
!python --version
Real-World Examples
Example 1: Importing Sales Data
import pandas as pd
# Multiple CSV files
import glob
# Find all CSV files
csv_files = glob.glob('sales/*.csv')
# Combine into one DataFrame
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
Example 2: Reading from a Database
from sqlalchemy import create_engine
import pandas as pd
# Connect to database
engine = create_engine('sqlite:///sales.db')
# Query and import (SQLite has no YEAR(); use strftime to filter by year)
query = """
    SELECT
        DATE(OrderDate) AS date,
        SUM(Amount) AS daily_sales
    FROM Orders
    WHERE strftime('%Y', OrderDate) = '2024'
    GROUP BY DATE(OrderDate)
"""
sales_df = pd.read_sql_query(query, engine)
Example 3: Handling Missing Data During Import
df = pd.read_csv(
    'data.csv',
    na_values=['NA', 'NULL', 'N/A', '', '?', '-999']
)
# Check what was imported as missing
print(df.isnull().sum())  # Count missing per column
print(df.head())
# Fill missing values after import (read_csv has no fillna argument)
df = pd.read_csv(
    'data.csv',
    na_values=['NA'],
    keep_default_na=True
)
df = df.fillna(0)  # Replace NaN with 0
Quick Reference
Most Common Import Patterns
# CSV
pd.read_csv('file.csv')
# Excel
pd.read_excel('file.xlsx', sheet_name=0)
# JSON
pd.read_json('file.json')
# Database
pd.read_sql_query('SELECT * FROM table', engine)
# NumPy array
np.loadtxt('file.txt', delimiter=',')
# Text file
with open('file.txt', 'r') as f:
    data = f.read()
Essential Parameters
Most of these work for both pd.read_csv and pd.read_excel (sep and encoding apply only to CSV):
sep=','            # Delimiter/separator
header=0           # Row to use as column names
skiprows=[0, 1]    # Rows to skip
nrows=1000         # Number of rows to read
usecols=[0, 2, 5]  # Column indices to read
index_col=0        # Column to use as index
na_values=['NA']   # Values to treat as missing
encoding='utf-8'   # File encoding
Excel-specific:
sheet_name=0      # Sheet number or name
parse_dates=True  # Parse date columns (also available in read_csv)
Putting it all together:
df = pd.read_csv(
    'data.csv',
    sep=',',            # Comma delimiter
    header=0,           # First row has headers
    nrows=10000,        # Read 10k rows
    usecols=[0, 1, 3],  # Select specific columns
    na_values=['N/A'],  # Treat 'N/A' as missing
    encoding='utf-8'    # Handle special chars
)
Pro Tips
✅ Always specify encoding for international data: encoding='utf-8'
✅ Use low_memory=False to silence mixed-dtype (DtypeWarning) warnings on large CSVs; it reads the file in one pass at the cost of more memory, so setting dtype explicitly is often the better fix
✅ Set dtype explicitly when you know the expected types
✅ Read in chunks for very large files: pd.read_csv(file, chunksize=10000) (see the sketch after these tips)
✅ Context managers are your friend—always use with statements
✅ Check data types immediately after import with df.dtypes
✅ Handle encoding errors gracefully: encoding_errors='ignore' in pd.read_csv (or errors='ignore' with open())
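As noted in the chunking tip above, chunksize turns read_csv into an iterator of DataFrames, so the full file never has to fit in memory at once. A minimal sketch that aggregates as it goes; the file and column names are hypothetical:

import pandas as pd

total_rows = 0
amount_sum = 0.0

# Each iteration yields a DataFrame of up to 10,000 rows
for chunk in pd.read_csv('very_large.csv', chunksize=10000):
    total_rows += len(chunk)
    amount_sum += chunk['amount'].sum()  # hypothetical numeric column

print(f"Rows: {total_rows}, total amount: {amount_sum}")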