# Source path: mi-task/test/text-image/text_image.py
# (Repository viewer header: 239 lines, 7.9 KiB, Python; 2025-08-21 12:51:13 +08:00)
import cv2
import numpy as np
import os
from pathlib import Path
def detect_white_paper(image):
    """Locate white, paper-like rectangles in a BGR image.

    The image is converted to HSV and thresholded for low saturation and
    high brightness, the mask is cleaned with a 5x5 morphological opening,
    and the external contours of the mask are examined.  A contour is kept
    when its area exceeds 5000 and the width/height ratio of its bounding
    box lies strictly between 1.0 and 3.0.

    Returns:
        A ``(paper_regions, white_mask)`` tuple: a list of ``(x, y, w, h)``
        bounding boxes, and the opened binary mask they were found in.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # "White" here means any hue, saturation <= 20 and value >= 230.
    white_lo = np.array([0, 0, 230])
    white_hi = np.array([180, 20, 255])
    white_mask = cv2.inRange(hsv_image, white_lo, white_hi)

    # Opening removes small speckles from the mask.
    open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    white_mask = cv2.morphologyEx(white_mask, cv2.MORPH_OPEN, open_kernel)

    outlines, _ = cv2.findContours(white_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    paper_regions = []
    for outline in outlines:
        # Discard blobs that are too small to be a sheet of paper.
        if not cv2.contourArea(outline) > 5000:
            continue
        bx, by, bw, bh = cv2.boundingRect(outline)
        # The sheet is expected to be landscape-oriented.
        if 1.0 < bw / bh < 3.0:
            paper_regions.append((bx, by, bw, bh))

    return paper_regions, white_mask
def find_text_regions(roi):
    """Find candidate character bounding boxes inside a BGR region of interest.

    The ROI is converted to grayscale, binarised with an inverted Gaussian
    adaptive threshold (block size 11, constant 2) and closed with a 2x2
    kernel before its external contours are extracted.  A contour is kept
    when its area lies strictly between 200 and 10000 and the width/height
    ratio of its bounding box lies strictly between 0.3 and 4.0.

    Returns:
        A ``(text_regions, binary)`` tuple: a list of ``(x, y, w, h)`` boxes
        in ROI coordinates, and the binarised image the contours came from.
    """
    grayscale = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding copes with uneven lighting; the result is
    # inverted so that dark strokes become the foreground.
    binary = cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        11,
        2,
    )

    # Closing joins strokes that belong to the same character.
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, close_kernel)

    outlines, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    text_regions = []
    for outline in outlines:
        # Keep only blobs whose area is plausible for a character.
        if not 200 < cv2.contourArea(outline) < 10000:
            continue
        bx, by, bw, bh = cv2.boundingRect(outline)
        # Reject extremely thin or extremely flat boxes.
        if 0.3 < bw / bh < 4.0:
            text_regions.append((bx, by, bw, bh))

    return text_regions, binary
def classify_digit(roi, region):
    """Classify the glyph inside ``region`` of ``roi`` as "1", "2" or "unknown".

    The glyph is cropped, converted to grayscale, binarised with an inverted
    Otsu threshold, resized to 20x20 and described by three features: overall
    foreground density, variance of the per-column foreground counts, and
    foreground density of the central 8x8 window.  Fixed thresholds on those
    features decide the label.

    Args:
        roi: BGR image that contains the glyph.
        region: ``(x, y, w, h)`` bounding box of the glyph in ``roi``
            coordinates.

    Returns:
        ``"1"``, ``"2"`` or ``"unknown"``.  ``"unknown"`` is also returned
        when the box selects no pixels at all.
    """
    x, y, w, h = region
    char_roi = roi[y:y + h, x:x + w]

    # A zero-sized or out-of-bounds box yields an empty slice, which the
    # OpenCV calls below would reject; report it as unclassifiable instead.
    if char_roi.size == 0:
        return "unknown"

    gray_char = cv2.cvtColor(char_roi, cv2.COLOR_BGR2GRAY)
    _, binary_char = cv2.threshold(
        gray_char, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Normalise the glyph size so the thresholds below are scale-independent.
    char_resized = cv2.resize(binary_char, (20, 20))

    # After the inverted threshold the strokes are the 255-valued pixels.
    foreground = char_resized == 255
    density = np.sum(foreground) / foreground.size

    # Spread of the per-column stroke counts (vertical projection).
    v_variance = np.var(np.sum(foreground, axis=0))

    # Share of stroke pixels in the central 8x8 window.
    center_density = np.sum(foreground[6:14, 6:14]) / (8 * 8)

    # Too little ink to be a digit.
    if density < 0.2:
        return "unknown"

    # "1": even vertical projection and a sparse centre.
    if v_variance < 8 and center_density < 0.4 and density < 0.5:
        return "1"

    # "2": uneven vertical projection and a denser centre.
    if v_variance > 12 and center_density > 0.3 and density > 0.4:
        return "2"

    return "unknown"
def _make_char_result(region_id, char_id, char_type, paper_origin, box):
    """Build one result record; ``position`` is ``box`` shifted by ``paper_origin``."""
    origin_x, origin_y = paper_origin
    bx, by, bw, bh = box
    return {
        'region_id': region_id,
        'char_id': char_id,
        'char_type': char_type,
        'position': (origin_x + bx, origin_y + by, bw, bh),
        'relative_position': (bx, by, bw, bh),
    }


def find_a_and_digit(image_path):
    """Find an "A" and the digit to its right on each white sheet in an image.

    The image is read from ``image_path``, white sheets are located with
    ``detect_white_paper`` and candidate characters inside each sheet with
    ``find_text_regions``.  Candidates are scanned left to right: the first
    box wider than 1.5x its height is recorded as "A" (a purely geometric
    guess), and the first later, narrower box that ``classify_digit`` labels
    "1" or "2" is recorded as the digit, which ends the scan of that sheet.

    Args:
        image_path: Path of the image file; converted with ``str()`` before
            being handed to OpenCV.

    Returns:
        ``(image, results)``.  ``image`` is ``None`` when the file cannot be
        read, the unmodified image when no sheet is found, and otherwise a
        copy annotated with sheet and character boxes.  ``results`` is a list
        of dicts with the keys ``region_id``, ``char_id``, ``char_type``,
        ``position`` (full-image ``(x, y, w, h)``) and ``relative_position``
        (``(x, y, w, h)`` inside the sheet).
    """
    image = cv2.imread(str(image_path))
    if image is None:
        print(f"无法读取图像: {image_path}")
        return None, []

    paper_regions, _ = detect_white_paper(image)
    if not paper_regions:
        print("未检测到白色纸张区域")
        return image, []
    print(f"检测到 {len(paper_regions)} 个白色纸张区域")

    # Draw on a copy: sheet ROIs are views into ``image``, so drawing on the
    # analysed array could leak borders into the analysis of later sheets.
    annotated = image.copy()
    all_results = []

    for i, (x, y, w, h) in enumerate(paper_regions):
        region_id = i + 1
        print(f"处理纸张区域 {region_id}: 位置({x}, {y}), 大小({w}x{h})")

        paper_roi = image[y:y + h, x:x + w]
        text_regions, _ = find_text_regions(paper_roi)
        print(f" 在区域 {region_id} 中找到 {len(text_regions)} 个文字区域")

        # Scan candidates from left to right.
        text_regions.sort(key=lambda r: r[0])

        a_found = False
        for j, text_region in enumerate(text_regions):
            tx, ty, tw, th = text_region
            if tw > th * 1.5:
                # Wide box: taken as the "A", but only the first one counts.
                if not a_found:
                    a_found = True
                    all_results.append(
                        _make_char_result(region_id, j + 1, 'A', (x, y), text_region)
                    )
                    print(f" 找到A: 位置({tx}, {ty}), 大小({tw}x{th})")
            elif a_found:
                # Narrow box to the right of the "A": candidate digit.
                digit_type = classify_digit(paper_roi, text_region)
                if digit_type in ('1', '2'):
                    all_results.append(
                        _make_char_result(region_id, j + 1, digit_type, (x, y), text_region)
                    )
                    print(f" 找到数字{digit_type}: 位置({tx}, {ty}), 大小({tw}x{th})")
                    break

        # Outline and label the sheet; keep the label inside the image when
        # the sheet touches the top edge.
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (255, 0, 0), 3)
        cv2.putText(annotated, f"Paper {region_id}", (x, max(y - 10, 20)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

    # Outline and label every recognised character.  Only 'A', '1' and '2'
    # are ever recorded, so a single colour is sufficient.
    color = (0, 255, 0)
    for result in all_results:
        rx, ry, rw, rh = result['position']
        cv2.rectangle(annotated, (rx, ry), (rx + rw, ry + rh), color, 2)
        cv2.putText(annotated, result['char_type'], (rx, max(ry - 5, 15)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    return annotated, all_results
def main():
    """Run the A-and-digit recognition demo on ``imgs/a1.jpg``.

    Prints the recognition results and writes the annotated image to
    ``imgs/result_a_digit.jpg``.  A missing input file, an unreadable image
    and a failed write are each reported instead of being ignored or
    crashing.
    """
    img_dir = Path("imgs")
    img_path = img_dir / "a1.jpg"
    print("开始识别图片中的A和右侧数字...")

    if not img_path.exists():
        print(f"图片不存在: {img_path}")
        return

    print(f"\n识别图片: {img_path}")
    result_img, results = find_a_and_digit(str(img_path))

    # find_a_and_digit returns None for the image (and reports it) when the
    # file cannot be decoded; there is nothing to print or save in that case.
    if result_img is None:
        return

    if results:
        print("\n识别结果:")
        for result in results:
            print(f" 区域{result['region_id']}-字符{result['char_id']}: {result['char_type']}, "
                  f"位置: {result['position']}")
    else:
        print("未识别到A和数字")

    # cv2.imwrite signals failure through its boolean return value.
    output_path = img_dir / "result_a_digit.jpg"
    if cv2.imwrite(str(output_path), result_img):
        print(f"\n结果图片已保存到: {output_path}")
    else:
        print(f"\n结果图片保存失败: {output_path}")

    print("\n识别完成!")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()