post date: 2024-10-15 00:00:00 +0000
SIFT Features: Unveiling the Magic of Scale-Invariant Feature Transform
Introduction
Scale-Invariant Feature Transform (SIFT) is a widely used computer vision algorithm for detecting and describing local features in images. SIFT features are valued because they remain stable under changes in scale and rotation, and are largely robust to changes in illumination.
High-Level Overview
SIFT involves four key stages:
- Scale-Space Extrema Detection
- Keypoint Localization
- Orientation Assignment
- Keypoint Descriptor Generation
Methodology
1. Scale-Space Representation
The core idea is to create a scale-space pyramid where the image is progressively blurred and downsampled:
L(x,y,σ) = G(x,y,σ) * I(x,y)
Where:
- L is the blurred image
- G is the Gaussian kernel
- I is the original image
- σ is the scale parameter
- * denotes convolution in x and y
2. Difference of Gaussian (DoG)
D(x,y,σ) = L(x,y,kσ) - L(x,y,σ)
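Here k is the constant multiplicative factor between adjacent scales. The DoG is a cheap approximation of the scale-normalized Laplacian of Gaussian: it only requires subtracting adjacent blurred images, and its local extrema across position and scale are the candidate keypoint locations.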
Python Implementation
import numpy as np
import cv2
import matplotlib.pyplot as plt
class SIFTDetector:
    """Simplified SIFT-style keypoint detector based on a Difference-of-Gaussian pyramid.

    Keypoints are local extrema of the DoG response over a 3x3x3
    (scale, y, x) neighbourhood whose absolute response exceeds ``threshold``.
    Sub-pixel localisation, edge-response rejection and orientation
    assignment are not implemented; descriptors are delegated to OpenCV.

    Args:
        octaves: Maximum number of pyramid octaves (each half the resolution
            of the previous one).
        scales: Number of Gaussian-blurred images per octave. This yields
            ``scales - 1`` DoG layers; at least 4 scales are needed for any
            keypoint to be detectable, because an extremum needs a DoG layer
            above and below it.
        threshold: Minimum absolute DoG response for a keypoint. It is
            compared against values on the input's own intensity scale; the
            image is not normalised first.

    Raises:
        ValueError: If ``scales`` is smaller than 2 (the scale step between
            layers would be undefined).
    """

    # Blur applied to the first scale of every octave.
    _BASE_SIGMA = 1.6

    def __init__(self, octaves=4, scales=5, threshold=0.04):
        if scales < 2:
            raise ValueError("scales must be at least 2 to build a DoG pyramid")
        self.octaves = octaves
        self.scales = scales
        self.threshold = threshold

    @staticmethod
    def _to_gray(image):
        """Return a single-channel view of *image*; 2-D input is passed through."""
        if image.ndim == 2:
            return image
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def _scale_step(self):
        """Return the multiplicative sigma factor between adjacent scales."""
        return 2 ** (1 / (self.scales - 1))

    def _generate_scale_space(self, image):
        """Build the Gaussian and Difference-of-Gaussian pyramids.

        Returns:
            ``(scale_pyramid, dog_pyramid)``: two lists with one entry per
            octave. Each entry is a list of float32 arrays: ``self.scales``
            blurred images and ``self.scales - 1`` DoG layers respectively.
            Fewer than ``self.octaves`` octaves are produced when the image
            becomes too small to halve again.
        """
        gray = self._to_gray(image).astype(np.float32)
        k = self._scale_step()
        scale_pyramid = []
        dog_pyramid = []

        for _ in range(self.octaves):
            octave_scales = [
                cv2.GaussianBlur(gray, (0, 0), self._BASE_SIGMA * (k ** s))
                for s in range(self.scales)
            ]
            # DoG layer i is the difference between blur levels i + 1 and i.
            octave_dogs = [
                upper - lower
                for lower, upper in zip(octave_scales, octave_scales[1:])
            ]
            scale_pyramid.append(octave_scales)
            dog_pyramid.append(octave_dogs)

            half_width = gray.shape[1] // 2
            half_height = gray.shape[0] // 2
            if half_width < 1 or half_height < 1:
                # cv2.resize rejects empty target sizes; stop the pyramid here.
                break
            gray = cv2.resize(gray, (half_width, half_height))

        return scale_pyramid, dog_pyramid

    @staticmethod
    def _extrema_mask(below, current, above):
        """Mark interior pixels of *current* that are 3x3x3 extrema.

        The returned boolean array covers ``current[1:-1, 1:-1]``. A pixel is
        an extremum when no value in its 3x3 window in *below*, *current* and
        *above* is larger (maximum) or none is smaller (minimum). The
        comparison is non-strict because the window contains the pixel itself.
        """
        rows, cols = current.shape
        center = current[1:-1, 1:-1]
        neighbourhood_max = center.copy()
        neighbourhood_min = center.copy()
        for layer in (below, current, above):
            for dy in range(3):
                for dx in range(3):
                    # Shifted view aligned with the interior pixels.
                    window = layer[dy:rows - 2 + dy, dx:cols - 2 + dx]
                    np.maximum(neighbourhood_max, window, out=neighbourhood_max)
                    np.minimum(neighbourhood_min, window, out=neighbourhood_min)
        return (center >= neighbourhood_max) | (center <= neighbourhood_min)

    def detect_keypoints(self, image):
        """Detect DoG scale-space extrema in *image*.

        Returns:
            A list of ``cv2.KeyPoint`` objects in original-image coordinates.
            ``size`` is the blur sigma of the detecting DoG layer scaled to
            the original resolution, and ``response`` is the absolute DoG
            value at the keypoint.
        """
        _, dog_pyramid = self._generate_scale_space(image)
        k = self._scale_step()
        keypoints = []

        for octave, octave_dogs in enumerate(dog_pyramid):
            octave_factor = 2 ** octave
            # First and last DoG layers have no neighbour on one side.
            for scale in range(1, len(octave_dogs) - 1):
                current = octave_dogs[scale]
                interior = current[1:-1, 1:-1]
                mask = self._extrema_mask(
                    octave_dogs[scale - 1], current, octave_dogs[scale + 1]
                )
                mask &= np.abs(interior) > self.threshold

                size = float(self._BASE_SIGMA * (k ** scale) * octave_factor)
                rows, cols = np.nonzero(mask)
                for y, x in zip(rows, cols):
                    # +1 converts interior indices back to full-layer indices.
                    keypoints.append(
                        cv2.KeyPoint(
                            float((x + 1) * octave_factor),
                            float((y + 1) * octave_factor),
                            size=size,
                            response=float(abs(interior[y, x])),
                        )
                    )

        return keypoints

    def compute_descriptors(self, image, keypoints):
        """Compute descriptors for *keypoints* using OpenCV's SIFT implementation.

        Returns whatever ``cv2.SIFT_create().compute`` yields as descriptors.
        NOTE(review): OpenCV documents that ``compute`` may drop keypoints it
        cannot describe; the filtered keypoint list is discarded here, so
        confirm rows still line up with *keypoints* before indexing by match.
        """
        gray = self._to_gray(image)
        _, descriptors = cv2.SIFT_create().compute(gray, keypoints)
        return descriptors

    def visualize(self, image, keypoints=None):
        """Show *image* and, when *keypoints* is given, the image with keypoints drawn."""
        plt.figure(figsize=(12, 5))

        plt.subplot(121)
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title('Original Image')
        plt.axis('off')

        if keypoints is not None:
            plt.subplot(122)
            keypoint_image = cv2.drawKeypoints(
                image, keypoints, None,
                flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS
            )
            plt.imshow(cv2.cvtColor(keypoint_image, cv2.COLOR_BGR2RGB))
            plt.title('SIFT Keypoints')
            plt.axis('off')

        plt.tight_layout()
        plt.show()
def match_features(desc1, desc2, ratio_threshold=0.75):
    """Match two descriptor sets with brute-force k-NN and a ratio test.

    Args:
        desc1: Query descriptors, one row per keypoint, or ``None``.
        desc2: Train descriptors, one row per keypoint, or ``None``.
        ratio_threshold: A best match is kept only when its distance is below
            ``ratio_threshold`` times the distance of the second-best match.

    Returns:
        A list of the best ``cv2.DMatch`` per query descriptor that passed the
        ratio test. Empty when either descriptor set is ``None`` or empty.
    """
    # Descriptor extraction can yield nothing when no keypoints were found.
    if desc1 is None or desc2 is None or len(desc1) == 0 or len(desc2) == 0:
        return []

    matcher = cv2.BFMatcher()
    candidates = matcher.knnMatch(desc1, desc2, k=2)

    good_matches = []
    for pair in candidates:
        # Fewer than two neighbours means the ratio test cannot be applied.
        if len(pair) < 2:
            continue
        best, runner_up = pair
        if best.distance < ratio_threshold * runner_up.distance:
            good_matches.append(best)
    return good_matches
def main():
    """Detect, describe, match and display features for two images on disk.

    Reads ``image1.jpg`` and ``image2.jpg`` from the working directory.

    Raises:
        FileNotFoundError: If either image cannot be read.
    """
    # Load reference images; cv2.imread signals failure by returning None.
    images = []
    for path in ('image1.jpg', 'image2.jpg'):
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        images.append(image)
    image1, image2 = images

    # Initialize SIFT detector
    sift = SIFTDetector()

    # Detect keypoints
    kp1 = sift.detect_keypoints(image1)
    kp2 = sift.detect_keypoints(image2)

    # Compute descriptors
    desc1 = sift.compute_descriptors(image1, kp1)
    desc2 = sift.compute_descriptors(image2, kp2)

    # Match features
    matches = match_features(desc1, desc2)

    # Visualization
    sift.visualize(image1, kp1)
    sift.visualize(image2, kp2)

    # Draw the first ten accepted matches
    match_img = cv2.drawMatches(
        image1, kp1,
        image2, kp2,
        matches[:10],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS
    )
    plt.figure(figsize=(15, 5))
    plt.imshow(cv2.cvtColor(match_img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()


if __name__ == '__main__':
    main()
Keypoint Descriptor Generation
After detecting keypoints, SIFT generates a 128-dimensional descriptor:
- Create a 4x4 grid around the keypoint
- Compute gradient orientations
- Create an 8-bin histogram of orientations for each sub-region (4 × 4 sub-regions × 8 bins = 128 values)
- Concatenate histograms into a single feature vector
Other Awesome Alternatives
- SURF (Speeded Up Robust Features)
- ORB (Oriented FAST and Rotated BRIEF)
- Deep learning-based feature descriptors
Conclusion
SIFT represents a milestone in computer vision, providing a robust method for feature detection that is invariant to scale and rotation and partially invariant to illumination changes.