Importing Data in Python Cheat Sheet

Master data importing in Python. Learn to read CSV, Excel, databases, JSON, and more with practical examples.

Oct 26, 2025 · 7 min read

Most real-world data science starts with importing data. Whether it's CSV files, Excel spreadsheets, databases, or JSON from APIs, you need to know how to get data into Python efficiently.

Data comes in many formats. Master these patterns and you'll handle 95% of all data import scenarios!

The Working Directory

Python needs to know where to find your files. The working directory is crucial.

1import os
2
3# Get current working directory
4wd = os.getcwd()
5print(f"Current directory: {wd}")
6
7# List files in current directory
8os.listdir(wd)
9
10# Change working directory
11os.chdir("/path/to/new/directory")
12
13# Useful operations
14os.rename("old_file.txt", "new_file.txt") # Rename file
15os.remove("unwanted_file.txt") # Delete file
16os.mkdir("new_folder") # Create folder
1import os
2
3# Get current working directory
4wd = os.getcwd()
5print(f"Current directory: {wd}")
6
7# List files in current directory
8os.listdir(wd)
9
10# Change working directory
11os.chdir("/path/to/new/directory")
12
13# Useful operations
14os.rename("old_file.txt", "new_file.txt") # Rename file
15os.remove("unwanted_file.txt") # Delete file
16os.mkdir("new_folder") # Create folder

Importing with Pandas

Pandas is your Swiss Army knife for data imports. Learn these well.

CSV Files

The most common format in data science.

1import pandas as pd
2
3# Basic CSV import
4df = pd.read_csv('data.csv')
5
6# With custom parameters
7df = pd.read_csv(
8 'data.csv',
9 nrows=1000, # Read first 1000 rows
10 header=0, # Row 0 as column names
11 sep=',', # Comma delimiter
12 index_col=0, # First column as index
13 na_values=['NA', 'N/A', ''], # Recognize as missing
14 dtype={'column_name': str} # Specify data types
15)
16
17# Common options
18df = pd.read_csv(
19 'messy_data.csv',
20 skiprows=[0, 1], # Skip header rows
21 usecols=[0, 2, 5], # Select specific columns
22 encoding='utf-8', # Handle special characters
23 low_memory=False # For large files
24)
1import pandas as pd
2
3# Basic CSV import
4df = pd.read_csv('data.csv')
5
6# With custom parameters
7df = pd.read_csv(
8 'data.csv',
9 nrows=1000, # Read first 1000 rows
10 header=0, # Row 0 as column names
11 sep=',', # Comma delimiter
12 index_col=0, # First column as index
13 na_values=['NA', 'N/A', ''], # Recognize as missing
14 dtype={'column_name': str} # Specify data types
15)
16
17# Common options
18df = pd.read_csv(
19 'messy_data.csv',
20 skiprows=[0, 1], # Skip header rows
21 usecols=[0, 2, 5], # Select specific columns
22 encoding='utf-8', # Handle special characters
23 low_memory=False # For large files
24)

Excel Files

Handling spreadsheets with multiple sheets.

1# Read Excel file
2file = 'sales_data.xlsx'
3excel_data = pd.ExcelFile(file)
4
5# Get sheet names
6print(excel_data.sheet_names)
7# ['Sheet1', 'Sheet2', 'Sheet3']
8
9# Read specific sheet
10df = pd.read_excel(file, sheet_name=0) # First sheet
11df = pd.read_excel(file, sheet_name='Sales') # By name
12
13# Read with custom options
14df = pd.read_excel(
15 file,
16 sheet_name='Sales',
17 skiprows=1,
18 names=['Date', 'Amount', 'Category'],
19 parse_dates=['Date']
20)
21
22# Read all sheets into dictionary
23all_sheets = pd.read_excel(file, sheet_name=None)
1# Read Excel file
2file = 'sales_data.xlsx'
3excel_data = pd.ExcelFile(file)
4
5# Get sheet names
6print(excel_data.sheet_names)
7# ['Sheet1', 'Sheet2', 'Sheet3']
8
9# Read specific sheet
10df = pd.read_excel(file, sheet_name=0) # First sheet
11df = pd.read_excel(file, sheet_name='Sales') # By name
12
13# Read with custom options
14df = pd.read_excel(
15 file,
16 sheet_name='Sales',
17 skiprows=1,
18 names=['Date', 'Amount', 'Category'],
19 parse_dates=['Date']
20)
21
22# Read all sheets into dictionary
23all_sheets = pd.read_excel(file, sheet_name=None)

JSON Files

Work with JSON data from APIs and config files.

1# From JSON file
2df = pd.read_json('data.json')
3
4# From JSON string
5import json
6
7json_string = '{"name": "Alice", "age": 25}'
8data = json.loads(json_string)
9df = pd.DataFrame([data])
10
11# Nested JSON
12with open('nested_data.json', 'r') as f:
13 data = json.load(f)
14 df = pd.json_normalize(data) # Flatten nested structure
1# From JSON file
2df = pd.read_json('data.json')
3
4# From JSON string
5import json
6
7json_string = '{"name": "Alice", "age": 25}'
8data = json.loads(json_string)
9df = pd.DataFrame([data])
10
11# Nested JSON
12with open('nested_data.json', 'r') as f:
13 data = json.load(f)
14 df = pd.json_normalize(data) # Flatten nested structure

Importing with NumPy

NumPy is faster for numerical data but less flexible than pandas.

Flat Files with NumPy

1import numpy as np
2
3# Simple text file
4data = np.loadtxt('data.txt')
5
6# With custom delimiter
7data = np.loadtxt('comma_separated.txt', delimiter=',')
8
9# Skip rows and select columns
10data = np.loadtxt(
11 'data.txt',
12 delimiter=',',
13 skiprows=2, # Skip header rows
14 usecols=[0, 2, 4], # Select specific columns
15 dtype=float
16)
17
18# Files with mixed data types
19data = np.genfromtxt(
20 'mixed_data.csv',
21 delimiter=',',
22 names=True, # Use first row as column names
23 dtype=None # Auto-detect types
24)
25
26# np.recfromcsv was removed in NumPy 2.0—use genfromtxt instead
27data = np.genfromtxt('data.csv', delimiter=',', names=True, dtype=None)
1import numpy as np
2
3# Simple text file
4data = np.loadtxt('data.txt')
5
6# With custom delimiter
7data = np.loadtxt('comma_separated.txt', delimiter=',')
8
9# Skip rows and select columns
10data = np.loadtxt(
11 'data.txt',
12 delimiter=',',
13 skiprows=2, # Skip header rows
14 usecols=[0, 2, 4], # Select specific columns
15 dtype=float
16)
17
18# Files with mixed data types
19data = np.genfromtxt(
20 'mixed_data.csv',
21 delimiter=',',
22 names=True, # Use first row as column names
23 dtype=None # Auto-detect types
24)
25
26# np.recfromcsv was removed in NumPy 2.0—use genfromtxt instead
27data = np.genfromtxt('data.csv', delimiter=',', names=True, dtype=None)

Relational Databases

Query databases directly with pandas and SQLAlchemy.

Setup Database Connection

1from sqlalchemy import create_engine, inspect
2
3# Create connection
4engine = create_engine('sqlite:///my_database.db')
5
6# Another SQLite file database (note the three slashes)
7engine = create_engine('sqlite:///Northwind.sqlite')
8
9# List all tables (engine.table_names() was removed in SQLAlchemy 2.0)
10table_names = inspect(engine).get_table_names()
11print(f"Available tables: {table_names}")
1from sqlalchemy import create_engine, inspect
2
3# Create connection
4engine = create_engine('sqlite:///my_database.db')
5
6# Another SQLite file database (note the three slashes)
7engine = create_engine('sqlite:///Northwind.sqlite')
8
9# List all tables (engine.table_names() was removed in SQLAlchemy 2.0)
10table_names = inspect(engine).get_table_names()
11print(f"Available tables: {table_names}")

Querying Databases

1import pandas as pd
2from sqlalchemy import text
3
4# Method 1: Using pandas directly
5df = pd.read_sql_query(
6 "SELECT * FROM Orders WHERE Year = 2024",
7 engine
8)
9
10# Method 2: Using SQLAlchemy connection
11# (SQLAlchemy 2.0 requires raw SQL strings to be wrapped in text())
12with engine.connect() as con:
13 rs = con.execute(text("SELECT * FROM Products"))
14 df = pd.DataFrame(rs.fetchall())
15 df.columns = rs.keys()
16
17# Fetch in chunks (for large datasets)
18with engine.connect() as con:
19 rs = con.execute(text("SELECT * FROM LargeTable"))
20 df = pd.DataFrame(rs.fetchmany(size=1000)) # Get 1000 rows
1import pandas as pd
2from sqlalchemy import text
3
4# Method 1: Using pandas directly
5df = pd.read_sql_query(
6 "SELECT * FROM Orders WHERE Year = 2024",
7 engine
8)
9
10# Method 2: Using SQLAlchemy connection
11# (SQLAlchemy 2.0 requires raw SQL strings to be wrapped in text())
12with engine.connect() as con:
13 rs = con.execute(text("SELECT * FROM Products"))
14 df = pd.DataFrame(rs.fetchall())
15 df.columns = rs.keys()
16
17# Fetch in chunks (for large datasets)
18with engine.connect() as con:
19 rs = con.execute(text("SELECT * FROM LargeTable"))
20 df = pd.DataFrame(rs.fetchmany(size=1000)) # Get 1000 rows

Other File Formats

Stata Files (.dta)

1# Read Stata files
2df = pd.read_stata('survey_data.dta')
1# Read Stata files
2df = pd.read_stata('survey_data.dta')

SAS Files (.sas7bdat)

1from sas7bdat import SAS7BDAT
2
3# Using context manager
4with SAS7BDAT('data.sas7bdat') as file:
5 df = file.to_data_frame()
6
7# Direct import
8df = pd.read_sas('data.sas7bdat')
1from sas7bdat import SAS7BDAT
2
3# Using context manager
4with SAS7BDAT('data.sas7bdat') as file:
5 df = file.to_data_frame()
6
7# Direct import
8df = pd.read_sas('data.sas7bdat')

MATLAB Files (.mat)

1import scipy.io
2
3mat = scipy.io.loadmat('workspace.mat')
4
5# Explore structure
6print(mat.keys()) # Dictionary keys
7print(mat.items()) # Key-value pairs
8
9# Access data
10data = mat['variable_name']
1import scipy.io
2
3mat = scipy.io.loadmat('workspace.mat')
4
5# Explore structure
6print(mat.keys()) # Dictionary keys
7print(mat.items()) # Key-value pairs
8
9# Access data
10data = mat['variable_name']

HDF5 Files

1import h5py
2
3data = h5py.File('data.hdf5', 'r')
4
5# Explore structure
6print(data.keys())
7
8# Navigate nested structure
9for key in data['meta'].keys():
10 print(key)
11 # Description, DescriptionURL, Detector, etc.
12
13# Read specific data (.value was removed in h5py 3.0)
14description = data['meta']['Description'][()]
1import h5py
2
3data = h5py.File('data.hdf5', 'r')
4
5# Explore structure
6print(data.keys())
7
8# Navigate nested structure
9for key in data['meta'].keys():
10 print(key)
11 # Description, DescriptionURL, Detector, etc.
12
13# Read specific data (.value was removed in h5py 3.0)
14description = data['meta']['Description'][()]

Pickled Files (.pkl)

1import pickle
2
3# Load pickled data
4with open('data.pkl', 'rb') as file:
5 pickled_data = pickle.load(file)
6
7# Pickled files are Python-specific but preserve exact Python objects
1import pickle
2
3# Load pickled data
4with open('data.pkl', 'rb') as file:
5 pickled_data = pickle.load(file)
6
7# Pickled files are Python-specific but preserve exact Python objects

Using the Context Manager

Always use with statements for file operations—they automatically handle cleanup.

1# Reading text files
2with open('data.txt', 'r') as file:
3 text = file.read() # Read entire file
4 first_line = file.readline() # Read one line
5 lines = file.readlines() # Read all lines into list
6
7# File automatically closes when done
8print(f"File closed: {file.closed}") # True
9
10# Writing files
11with open('output.txt', 'w') as file:
12 file.write("Line 1\n")
13 file.write("Line 2\n")
1# Reading text files
2with open('data.txt', 'r') as file:
3 text = file.read() # Read entire file
4 first_line = file.readline() # Read one line
5 lines = file.readlines() # Read all lines into list
6
7# File automatically closes when done
8print(f"File closed: {file.closed}") # True
9
10# Writing files
11with open('output.txt', 'w') as file:
12 file.write("Line 1\n")
13 file.write("Line 2\n")

Why use context managers?

  • Automatic cleanup if errors occur
  • No need to remember file.close()
  • More Pythonic
  • Prevents resource leaks

Exploring Your Data

Once imported, always explore before analysis.

Pandas DataFrames

1# Basic exploration
2df.head() # First 5 rows
3df.tail() # Last 5 rows
4df.head(10) # First 10 rows
5
6# Structure
7df.shape # (rows, columns)
8df.info() # Data types and missing values
9df.describe() # Summary statistics
10df.columns # Column names
11df.index # Row indices
12
13# Convert to NumPy
14data_array = df.to_numpy() # As NumPy array (preferred over .values)
1# Basic exploration
2df.head() # First 5 rows
3df.tail() # Last 5 rows
4df.head(10) # First 10 rows
5
6# Structure
7df.shape # (rows, columns)
8df.info() # Data types and missing values
9df.describe() # Summary statistics
10df.columns # Column names
11df.index # Row indices
12
13# Convert to NumPy
14data_array = df.to_numpy() # As NumPy array (preferred over .values)

NumPy Arrays

1# Array properties
2data_array.dtype # Data type of elements
3data_array.shape # Dimensions
4len(data_array) # Length
5
6# Quick stats
7data_array.min() # Minimum value
8data_array.max() # Maximum value
9data_array.mean() # Mean
10data_array.std() # Standard deviation
1# Array properties
2data_array.dtype # Data type of elements
3data_array.shape # Dimensions
4len(data_array) # Length
5
6# Quick stats
7data_array.min() # Minimum value
8data_array.max() # Maximum value
9data_array.mean() # Mean
10data_array.std() # Standard deviation

Magic Commands (Jupyter Notebooks)

If you're working in Jupyter, these commands are gold:

1# File system commands
2!ls # List files
3!pwd # Current directory
4%cd .. # Change directory (!cd runs in a subshell and does not persist)
5!mkdir new_folder # Create folder
6
7# File operations
8!head data.csv # Preview file
9!wc -l data.csv # Count lines
10!cat file.txt # Display file contents
11
12# Install packages
13!pip install package_name
14
15# System info
16!python --version
1# File system commands
2!ls # List files
3!pwd # Current directory
4%cd .. # Change directory (!cd runs in a subshell and does not persist)
5!mkdir new_folder # Create folder
6
7# File operations
8!head data.csv # Preview file
9!wc -l data.csv # Count lines
10!cat file.txt # Display file contents
11
12# Install packages
13!pip install package_name
14
15# System info
16!python --version

Real-World Examples

Example 1: Importing Sales Data

1import pandas as pd
2
3# Multiple CSV files
4import glob
5
6# Find all CSV files
7csv_files = glob.glob('sales/*.csv')
8
9# Combine into one DataFrame
10dfs = []
11for file in csv_files:
12 df = pd.read_csv(file)
13 dfs.append(df)
14
15combined_df = pd.concat(dfs, ignore_index=True)
1import pandas as pd
2
3# Multiple CSV files
4import glob
5
6# Find all CSV files
7csv_files = glob.glob('sales/*.csv')
8
9# Combine into one DataFrame
10dfs = []
11for file in csv_files:
12 df = pd.read_csv(file)
13 dfs.append(df)
14
15combined_df = pd.concat(dfs, ignore_index=True)

Example 2: Reading from Database

1from sqlalchemy import create_engine
2import pandas as pd
3
4# Connect to database
5engine = create_engine('sqlite:///sales.db')
6
7# Query and import
8query = """
9 SELECT
10 DATE(OrderDate) as date,
11 SUM(Amount) as daily_sales
12 FROM Orders
13 WHERE YEAR(OrderDate) = 2024
14 GROUP BY DATE(OrderDate)
15"""
16
17sales_df = pd.read_sql_query(query, engine)
1from sqlalchemy import create_engine
2import pandas as pd
3
4# Connect to database
5engine = create_engine('sqlite:///sales.db')
6
7# Query and import
8query = """
9 SELECT
10 DATE(OrderDate) as date,
11 SUM(Amount) as daily_sales
12 FROM Orders
13 WHERE YEAR(OrderDate) = 2024
14 GROUP BY DATE(OrderDate)
15"""
16
17sales_df = pd.read_sql_query(query, engine)

Example 3: Handling Missing Data During Import

1df = pd.read_csv(
2 'data.csv',
3 na_values=['NA', 'NULL', 'N/A', '', '?', '-999']
4)
5
6# Check what was imported as missing
7print(df.isnull().sum()) # Count missing per column
8print(df.head())
9
10# Fill missing after import (read_csv has no fillna parameter)
11df = pd.read_csv(
12 'data.csv',
13 na_values=['NA'],
14 keep_default_na=True
15)
16df = df.fillna(0) # Replace NaN with 0
1df = pd.read_csv(
2 'data.csv',
3 na_values=['NA', 'NULL', 'N/A', '', '?', '-999']
4)
5
6# Check what was imported as missing
7print(df.isnull().sum()) # Count missing per column
8print(df.head())
9
10# Fill missing after import (read_csv has no fillna parameter)
11df = pd.read_csv(
12 'data.csv',
13 na_values=['NA'],
14 keep_default_na=True
15)
16df = df.fillna(0) # Replace NaN with 0

Quick Reference

Most Common Import Patterns

1# CSV
2pd.read_csv('file.csv')
3
4# Excel
5pd.read_excel('file.xlsx', sheet_name=0)
6
7# JSON
8pd.read_json('file.json')
9
10# Database
11pd.read_sql_query('SELECT * FROM table', engine)
12
13# NumPy array
14np.loadtxt('file.txt', delimiter=',')
15
16# Text file
17with open('file.txt', 'r') as f:
18 data = f.read()
1# CSV
2pd.read_csv('file.csv')
3
4# Excel
5pd.read_excel('file.xlsx', sheet_name=0)
6
7# JSON
8pd.read_json('file.json')
9
10# Database
11pd.read_sql_query('SELECT * FROM table', engine)
12
13# NumPy array
14np.loadtxt('file.txt', delimiter=',')
15
16# Text file
17with open('file.txt', 'r') as f:
18 data = f.read()

Essential Parameters

These work for both CSV and Excel:

1sep=',' # Delimiter/separator
2header=0 # Row to use as column names
3skiprows=[0, 1] # Rows to skip
4nrows=1000 # Number of rows to read
5usecols=[0, 2, 5] # Column indices to read
6index_col=0 # Column to use as index
7na_values=['NA'] # Values to treat as missing
8encoding='utf-8' # File encoding
1sep=',' # Delimiter/separator
2header=0 # Row to use as column names
3skiprows=[0, 1] # Rows to skip
4nrows=1000 # Number of rows to read
5usecols=[0, 2, 5] # Column indices to read
6index_col=0 # Column to use as index
7na_values=['NA'] # Values to treat as missing
8encoding='utf-8' # File encoding

Excel-specific:

1sheet_name=0 # Sheet number or name
2parse_dates=True # Parse date columns
1sheet_name=0 # Sheet number or name
2parse_dates=True # Parse date columns

Putting it all together:

1df = pd.read_csv(
2 'data.csv',
3 sep=',', # Comma delimiter
4 header=0, # First row has headers
5 nrows=10000, # Read 10k rows
6 usecols=[0, 1, 3], # Select specific columns
7 na_values=['N/A'], # Treat 'N/A' as missing
8 encoding='utf-8' # Handle special chars
9)
1df = pd.read_csv(
2 'data.csv',
3 sep=',', # Comma delimiter
4 header=0, # First row has headers
5 nrows=10000, # Read 10k rows
6 usecols=[0, 1, 3], # Select specific columns
7 na_values=['N/A'], # Treat 'N/A' as missing
8 encoding='utf-8' # Handle special chars
9)

Pro Tips

Always specify encoding for international data: encoding='utf-8'
Use low_memory=False for large CSV files to avoid type inference issues
Set dtype explicitly when you know the expected types
Read in chunks for very large files: pd.read_csv(file, chunksize=10000)
Context managers are your friend—always use with statements
Check data types immediately after import with df.dtypes
Handle encoding errors gracefully: errors='ignore' in open(), or encoding_errors='ignore' in pd.read_csv (pandas 1.3+)