Most real-world data science starts with importing data. Whether it’s CSV files, Excel spreadsheets, databases, or JSON from APIs, you need to know how to get data into Python efficiently.
The Working Directory
Python resolves relative file paths against the current working directory, so check and set it before you import anything.
import os
# Get current working directory
wd = os.getcwd()
print(f"Current directory: {wd}")
# List files in current directory
os.listdir(wd)
# Change working directory
os.chdir("/path/to/new/directory")
# Useful operations
os.rename("old_file.txt", "new_file.txt")  # Rename file
os.remove("unwanted_file.txt")             # Delete file
os.mkdir("new_folder")                     # Create folder
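Before importing anything, it also helps to confirm the file is actually where you think it is. A minimal sketch using the same os module; the folder and file name are hypothetical:

import os

# Build a path that works on any operating system
data_path = os.path.join("data", "sales_2024.csv")  # hypothetical location

# Check that the file exists before trying to read it
if os.path.exists(data_path):
    print(f"Found: {data_path}")
else:
    print(f"Missing: {data_path} - check your working directory")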
Importing with Pandas
Pandas is your Swiss Army knife for data imports. Learn these well.
CSV Files
The most common format in data science.
import pandas as pd
# Basic CSV import
df = pd.read_csv('data.csv')
# With custom parameters
df = pd.read_csv(
    'data.csv',
    nrows=1000,                   # Read first 1000 rows
    header=0,                     # Row 0 as column names
    sep=',',                      # Comma delimiter
    index_col=0,                  # First column as index
    na_values=['NA', 'N/A', ''],  # Recognize as missing
    dtype={'column_name': str}    # Specify data types
)
# Common options
df = pd.read_csv(
    'messy_data.csv',
    skiprows=[0, 1],    # Skip header rows
    usecols=[0, 2, 5],  # Select specific columns
    encoding='utf-8',   # Handle special characters
    low_memory=False    # Avoid mixed-dtype warnings on large files
)
Excel Files
Handling spreadsheets with multiple sheets.
# Read Excel file
file = 'sales_data.xlsx'
excel_data = pd.ExcelFile(file)
# Get sheet names
print(excel_data.sheet_names)
# ['Sheet1', 'Sheet2', 'Sheet3']
# Read specific sheet
df = pd.read_excel(file, sheet_name=0)        # First sheet
df = pd.read_excel(file, sheet_name='Sales')  # By name
# Read with custom options
df = pd.read_excel(
    file,
    sheet_name='Sales',
    skiprows=1,
    names=['Date', 'Amount', 'Category'],
    parse_dates=['Date']
)
# Read all sheets into dictionary
all_sheets = pd.read_excel(file, sheet_name=None)
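sheet_name=None returns a dictionary mapping sheet names to DataFrames. A minimal sketch of stacking them into a single DataFrame, assuming the sheets share the same columns:

# all_sheets is a dict: {sheet_name: DataFrame}
frames = []
for name, sheet_df in all_sheets.items():
    sheet_df['source_sheet'] = name  # remember which sheet each row came from
    frames.append(sheet_df)

combined = pd.concat(frames, ignore_index=True)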
JSON Files
Work with JSON data from APIs and config files.
# From JSON file
df = pd.read_json('data.json')
# From JSON string
import json
json_string = '{"name": "Alice", "age": 25}'
data = json.loads(json_string)
df = pd.DataFrame([data])
# Nested JSON
with open('nested_data.json', 'r') as f:
    data = json.load(f)

df = pd.json_normalize(data)  # Flatten nested structure
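JSON from a web API follows the same pattern: fetch the response, then flatten it. A minimal sketch using the requests library; the URL and the shape of the response are hypothetical:

import requests
import pandas as pd

# Hypothetical endpoint that returns a list of JSON records
response = requests.get('https://api.example.com/orders')
response.raise_for_status()      # Fail loudly on HTTP errors

records = response.json()        # Parse the JSON body
df = pd.json_normalize(records)  # Flatten nested fields into columns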
Importing with NumPy
NumPy is faster for numerical data but less flexible than pandas.
Flat Files with NumPy
import numpy as np
# Simple text file
data = np.loadtxt('data.txt')
# With custom delimiter
data = np.loadtxt('comma_separated.txt', delimiter=',')
# Skip rows and select columns
data = np.loadtxt(
    'data.txt',
    delimiter=',',
    skiprows=2,         # Skip header rows
    usecols=[0, 2, 4],  # Select specific columns
    dtype=float
)
# Files with mixed data types
data = np.genfromtxt(
    'mixed_data.csv',
    delimiter=',',
    names=True,        # Use first row as column names
    dtype=None,        # Auto-detect types
    encoding='utf-8'   # Decode text columns as str rather than bytes
)
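With names=True, genfromtxt returns a structured array, so columns are accessed by name rather than by position. A short sketch; the column name 'price' is hypothetical:

# Column names detected from the header row
print(data.dtype.names)

# Pull out one column by name and compute a quick statistic
prices = data['price']
print(prices.mean())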
# Even simpler for CSV
data = np.recfromcsv('data.csv')  # CSV-friendly defaults: comma delimiter, first row as names
# Note: recfromcsv is deprecated/removed in recent NumPy releases; prefer np.genfromtxt as above
Relational Databases
Query databases directly with pandas and SQLAlchemy.
Setting Up a Database Connection
from sqlalchemy import create_engine
# Create connection
engine = create_engine('sqlite:///my_database.db')
# Another SQLite file (note the three slashes before the file name)
engine = create_engine('sqlite:///Northwind.sqlite')
# List all tables (engine.table_names() was removed in SQLAlchemy 2.0)
from sqlalchemy import inspect

table_names = inspect(engine).get_table_names()
print(f"Available tables: {table_names}")
Querying Databases
import pandas as pd
# Method 1: Using pandas directly
df = pd.read_sql_query(
    "SELECT * FROM Orders WHERE Year = 2024",
    engine
)
# Method 2: Using a SQLAlchemy connection
from sqlalchemy import text  # raw SQL strings must be wrapped in text() in SQLAlchemy 2.0

with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM Products"))
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()
# Fetch in chunks (for large result sets)
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM LargeTable"))
    df = pd.DataFrame(rs.fetchmany(size=1000))  # Get the first 1000 rows
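A single fetchmany call only returns the first chunk. To walk through an entire large table, keep calling it until nothing comes back; a minimal sketch reusing the engine from above (the table name is hypothetical):

import pandas as pd
from sqlalchemy import text

chunks = []
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM LargeTable"))
    columns = list(rs.keys())
    while True:
        rows = rs.fetchmany(size=1000)  # Next 1000 rows, or [] when exhausted
        if not rows:
            break
        # Process or aggregate each chunk here instead of keeping them all
        chunks.append(pd.DataFrame(rows, columns=columns))

df = pd.concat(chunks, ignore_index=True)

pd.read_sql_query also accepts a chunksize argument that yields DataFrames of that size, which does the same job with less code.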
Other File Formats
Stata Files (.dta)
# Read Stata files
df = pd.read_stata('survey_data.dta')
SAS Files (.sas7bdat)
from sas7bdat import SAS7BDAT
# Using context manager
with SAS7BDAT('data.sas7bdat') as file:
    df = file.to_data_frame()
# Direct import
df = pd.read_sas('data.sas7bdat')
MATLAB Files (.mat)
import scipy.io
mat = scipy.io.loadmat('workspace.mat')
# Explore structure
print(mat.keys())   # Dictionary keys
print(mat.items())  # Key-value pairs
# Access data
data = mat['variable_name']
HDF5 Files
import h5py
data = h5py.File('data.hdf5', 'r')
# Explore structure
print(data.keys())
# Navigate nested structure
for key in data['meta'].keys():
    print(key)
# Description, DescriptionURL, Detector, etc.
# Read specific data
description = data['meta']['Description'][()]  # .value was removed from h5py; use [()]
Pickled Files (.pkl)
import pickle
# Load pickled data
with open('data.pkl', 'rb') as file:
    pickled_data = pickle.load(file)
# Pickled files are Python-specific but preserve exact Python objects
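For completeness, a minimal sketch of writing a pickle as well, showing that an arbitrary Python object survives the round trip (the object here is made up). Only unpickle files you trust, since loading a pickle can execute arbitrary code.

import pickle

results = {'model': 'baseline', 'scores': [0.81, 0.85, 0.79]}  # hypothetical object

# Write: 'wb' because pickle is a binary format
with open('results.pkl', 'wb') as f:
    pickle.dump(results, f)

# Read it back with identical structure and types
with open('results.pkl', 'rb') as f:
    restored = pickle.load(f)

print(restored == results)  # True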
Using the Context Manager
Always use with statements for file operations; they handle cleanup automatically.
# Reading text files
with open('data.txt', 'r') as file:
    text = file.read()            # Read entire file
    # Note: read() consumes the file; in practice pick one of these per open()
    first_line = file.readline()  # Read one line
    lines = file.readlines()      # Read remaining lines into a list
# File automatically closes when done
print(f"File closed: {file.closed}")  # True
# Writing files
with open('output.txt', 'w') as file:
    file.write("Line 1\n")
    file.write("Line 2\n")
Why use context managers?
- Automatic cleanup if errors occur
- No need to remember file.close()
- More Pythonic
- Prevents resource leaks
Exploring Your Data
Once imported, always explore before analysis.
Pandas DataFrames
# Basic exploration
df.head()    # First 5 rows
df.tail()    # Last 5 rows
df.head(10)  # First 10 rows
# Structure
df.shape       # (rows, columns)
df.info()      # Data types and missing values
df.describe()  # Summary statistics
df.columns     # Column names
df.index       # Row indices
# Convert to NumPy
data_array = df.values  # As NumPy array (df.to_numpy() is the modern equivalent)
NumPy Arrays
# Array properties
data_array.dtype  # Data type of elements
data_array.shape  # Dimensions
len(data_array)   # Length
# Quick stats
data_array.min()   # Minimum value
data_array.max()   # Maximum value
data_array.mean()  # Mean
data_array.std()   # Standard deviation
Magic Commands (Jupyter Notebooks)
If you’re working in Jupyter, these commands are gold:
# File system commands
!ls                # List files
!pwd               # Current directory
%cd ..             # Change directory (%cd persists; !cd runs in a throwaway subshell)
!mkdir new_folder  # Create folder
# File operations
!head data.csv   # Preview file
!wc -l data.csv  # Count lines
!cat file.txt    # Display file contents
# Install packages
!pip install package_name
# System info
!python --version
Real-World Examples
Example 1: Importing Sales Data
import pandas as pd
# Multiple CSV files
import glob
# Find all CSV files
csv_files = glob.glob('sales/*.csv')
# Combine into one DataFrame
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
Example 2: Reading from a Database
from sqlalchemy import create_engine
import pandas as pd
# Connect to database
engine = create_engine('sqlite:///sales.db')
# Query and import (SQLite has no YEAR(); use strftime to filter by year)
query = """
    SELECT
        DATE(OrderDate) AS date,
        SUM(Amount) AS daily_sales
    FROM Orders
    WHERE strftime('%Y', OrderDate) = '2024'
    GROUP BY DATE(OrderDate)
"""
sales_df = pd.read_sql_query(query, engine)
Example 3: Handling Missing Data During Import
df = pd.read_csv(
    'data.csv',
    na_values=['NA', 'NULL', 'N/A', '', '?', '-999']
)
# Check what was imported as missing
print(df.isnull().sum())  # Count missing per column
print(df.head())
# Fill missing values after import (read_csv has no fillna argument)
df = pd.read_csv(
    'data.csv',
    na_values=['NA'],
    keep_default_na=True
)
df = df.fillna(0)  # Replace NaN with 0
Quick Reference
Most Common Import Patterns
# CSV
pd.read_csv('file.csv')
# Excel
pd.read_excel('file.xlsx', sheet_name=0)
# JSON
pd.read_json('file.json')
# Database
pd.read_sql_query('SELECT * FROM table', engine)
# NumPy array
np.loadtxt('file.txt', delimiter=',')
# Text file
with open('file.txt', 'r') as f:
    data = f.read()
Essential Parameters
Most of these work for both pd.read_csv and pd.read_excel (sep and encoding apply only to CSV):
sep=','            # Delimiter/separator
header=0           # Row to use as column names
skiprows=[0, 1]    # Rows to skip
nrows=1000         # Number of rows to read
usecols=[0, 2, 5]  # Column indices to read
index_col=0        # Column to use as index
na_values=['NA']   # Values to treat as missing
encoding='utf-8'   # File encoding
Excel-specific:
sheet_name=0      # Sheet number or name
parse_dates=True  # Parse date columns (also available in read_csv)
Putting it all together:
df = pd.read_csv(
    'data.csv',
    sep=',',            # Comma delimiter
    header=0,           # First row has headers
    nrows=10000,        # Read 10k rows
    usecols=[0, 1, 3],  # Select specific columns
    na_values=['N/A'],  # Treat 'N/A' as missing
    encoding='utf-8'    # Handle special chars
)
Pro Tips
✅ Always specify encoding for international data: encoding='utf-8'
✅ Use low_memory=False to silence mixed-dtype (DtypeWarning) warnings on large CSVs; it reads the file in one pass at the cost of more memory, so setting dtype explicitly is often the better fix
✅ Set dtype explicitly when you know the expected types
✅ Read in chunks for very large files: pd.read_csv(file, chunksize=10000) (see the sketch after these tips)
✅ Context managers are your friend—always use with statements
✅ Check data types immediately after import with df.dtypes
✅ Handle encoding errors gracefully: encoding_errors='ignore' in pd.read_csv (or errors='ignore' with open())
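As noted in the chunking tip above, chunksize turns read_csv into an iterator of DataFrames, so the full file never has to fit in memory at once. A minimal sketch that aggregates as it goes; the file and column names are hypothetical:

import pandas as pd

total_rows = 0
amount_sum = 0.0

# Each iteration yields a DataFrame of up to 10,000 rows
for chunk in pd.read_csv('very_large.csv', chunksize=10000):
    total_rows += len(chunk)
    amount_sum += chunk['amount'].sum()  # hypothetical numeric column

print(f"Rows: {total_rows}, total amount: {amount_sum}")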