GIS

GIS库JTS例子

JTS库依赖

<dependency>
    <groupId>com.vividsolutions</groupId>
    <artifactId>jts</artifactId>
    <version>1.13</version>
</dependency>

判断点是否在多边形内

GeometryFactory factory = new GeometryFactory();
// 定义多边形的外环坐标(示例坐标,首尾坐标必须相同以构成闭合环)
Coordinate[] cs = new Coordinate[]{
    new Coordinate(116.0, 34.0),
    new Coordinate(117.0, 34.0),
    new Coordinate(117.0, 35.0),
    new Coordinate(116.0, 35.0),
    new Coordinate(116.0, 34.0)
};
LinearRing shell = factory.createLinearRing(cs);
Polygon polygon = factory.createPolygon(shell);
// Polygon polygon = factory.createPolygon(shell, holes);// holes 是 LinearRing 的数组
// 定义点(示例坐标)
double lon = 116.5, lat = 34.5;
Coordinate coordinate = new Coordinate(lon, lat);
// 判断是否在多边形内
boolean contains = SimplePointInAreaLocator.containsPointInPolygon(coordinate, polygon);

抽稀算法(道格拉斯-普克算法 Douglas–Peucker algorithm)

 
double lon1 = 116.556408D;
double lat1 = 34.012041D;
Coordinate[] coordinates = new Coordinate[]{
  new Coordinate(lon1, lat1),
  ...
};
CoordinateSequence coordinateSequence = new CoordinateArraySequence(coordinates);
//数据精度
PrecisionModel precisionModel = new PrecisionModel(PrecisionModel.FLOATING_SINGLE);
GeometryFactory factory = new GeometryFactory(precisionModel, 4326);
LineString lineString = new LineString(coordinateSequence, factory);
 
DouglasPeuckerSimplifier simplifier = new DouglasPeuckerSimplifier(lineString);
// 抽稀容差,单位为度。例如抽稀距离20m,按1度约110km换算,即 20/110000 ≈ 0.00018 度
simplifier.setDistanceTolerance(0.0001D);
Geometry geometry = simplifier.getResultGeometry();
// 抽稀后的结果
Coordinate[] result = geometry.getCoordinates();
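
如果不想手工换算容差,也可以写一个辅助方法把米换算成度再传入。下面是一个示意实现(假设1度约对应110km,仅做量级估算,metersToDegrees 为本文虚构的方法名):

// 示意:把以米为单位的抽稀距离粗略换算为度(假设1度约110km)
static double metersToDegrees(double meters) {
    return meters / 110000.0;
}
// 用法示意:simplifier.setDistanceTolerance(metersToDegrees(20)); // ≈ 0.00018
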
音视频

海康威视RTSP流地址

RTSP流格式

rtsp://[username]:[password]@[ip]:[port]/[codec]/[channel]/[subtype]/av_stream

说明:

参数 说明 示例
username 用户名 如admin
password 密码 123456
ip 设备IP 192.168.1.1
port 端口号,默认为554,可省略 554
codec 编码 h264,MPEG-4,mpeg4等
channel 通道号,起始为1,通道1则为ch1 ch1
subtype 码流类型,主码流为main,辅码流为sub main
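
按上述格式拼出的一个示例地址(用户名、密码、IP等均取自上表的示例值,仅作示意):

rtsp://admin:123456@192.168.1.1:554/h264/ch1/main/av_stream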

推流

# ffmpeg -i "rtsp://username:password@192.168.1.1:554/h264/ch1/sub/av_stream" -vcodec copy -preset:v ultrafast -tune:v zerolatency -acodec copy -f flv  -an "rtmp://192.168.1.1/live/haikang_01"
ffmpeg version 4.2.1 Copyright (c) 2000-2019 the FFmpeg developers
  built with Apple clang version 11.0.0 (clang-1100.0.33.8)
音视频

流媒体服务器nginx-rtmp安装

下载源代码

# git clone git@github.com:arut/nginx-rtmp-module.git
git clone git@github.com:winshining/nginx-http-flv-module.git
axel -n 100 http://nginx.org/download/nginx-1.17.5.tar.gz

安装依赖

## ubuntu
sudo apt-get install openssl libssl-dev
sudo apt-get install libpcre3 libpcre3-dev
sudo apt-get install zlib1g-dev
 
## centos
sudo yum install -y pcre pcre-devel
sudo yum install -y openssl openssl-devel
sudo yum install -y zlib-devel zlib

编译安装

tar zxvf nginx-1.17.5.tar.gz
cd nginx-1.17.5/
 ./configure --prefix=/usr/local/nginx --add-module=/home/work/nginx-http-flv-module --with-http_ssl_module --with-debug

配置nginx用户

sudo useradd nginx
## sudo vim /etc/passwd
## nginx:x:1001:1001:,,,:/home/nginx:/usr/sbin/nologin

创建相关目录

# 创建相关目录并修改所有者
sudo mkdir -p /usr/local/nginx/data/dash/live
sudo mkdir -p /usr/local/nginx/data/hls/live
sudo mkdir -p /usr/local/nginx/stat
sudo cp /home/work/nginx-http-flv-module/stat.xsl /usr/local/nginx/stat/
 
sudo chown -R nginx /usr/local/nginx/data
sudo chown -R nginx /usr/local/nginx/stat

修改配置文件

见附录:示例

启动nginx服务器

sudo /usr/local/nginx/sbin/nginx

测试

# 推流
ffmpeg -re -i /Applications/ambari-vagrant/ubuntu18.4/data/fulankelin-hd.mp4 -c copy -f flv rtmp://u1802/live/fulankelin-hd
 
# 支持播放地址
rtmp://u1802/live/fulankelin-hd
 
http://u1802/live?app=live&stream=fulankelin-hd
http://u1802/live?port=1935&app=live&stream=fulankelin-hd
http://u1802/live/fulankelin-hd.flv
http://u1802/live/fulankelin-hd.flv?port=1935
 
http://u1802/live/fulankelin-hd.mpd
http://u1802/live/fulankelin-hd.m3u8
 

配置示例

user  nginx;
worker_processes  1;
 
 
# error_log  logs/error.log;
# error_log  logs/error.log  notice;
error_log  logs/error.log  debug;
 
pid        logs/nginx.pid;
 
 
events {
    worker_connections  4096;
}
 
http {
    include       mime.types;
    default_type  application/octet-stream;
 
    log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
                      '$status $body_bytes_sent "$http_referer" '
                      '"$http_user_agent" "$http_x_forwarded_for"';
 
    access_log  logs/access.log  main;
 
    sendfile        on;
    #tcp_nopush     on;
 
    #keepalive_timeout  0;
    keepalive_timeout  65;
 
    #gzip  on;
 
    server {
        listen       80;
        server_name  localhost;
 
        #charset koi8-r;
 
        #access_log  logs/host.access.log  main;
 
        location / {
            root   html;
            index  index.html index.htm;
        }
 
        #error_page  404              /404.html;
 
        # redirect server error pages to the static page /50x.html
        #
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
 
        # location ~* \.(m3u8)$ {
        #     types {
        #         application/vnd.apple.mpegurl m3u8;
        #         video/mp2t ts;
        #     }
 
        #     root /usr/local/nginx/data;
        #     add_header 'Cache-Control' 'no-cache';
        # }
        location /live {
            flv_live on; # 打开http播放flv直播流的方式
            chunked_transfer_encoding on; # 支持Transfer-Encoding: chunked方式回复
 
            add_header 'Access-Control-Allow-Origin' '*';
            add_header 'Access-Control-Allow-Credentials' 'true';
        }
 
        location ~ \.(mpd|m4a|m4v)$ {
            root /usr/local/nginx/data/dash/;
            add_header 'Cache-Control' 'no-cache';
        }
        # }
        location ~ \.(m3u8|ts)$ {
            types {
                application/vnd.apple.mpegurl m3u8;
                video/mp2t ts;
            }
 
            root /usr/local/nginx/data/hls/;
            add_header 'Cache-Control' 'no-cache';
        }
 
        location ~ \.(flv)$ {
            rewrite ^/(.*)/(.*)\.(flv)$ /$1?app=$1&stream=$2 last;
        }
 
        location /stat {
            rtmp_stat all;
            rtmp_stat_stylesheet stat.xsl;
        }
        location /stat.xsl {
            root /usr/local/nginx/stat/;
        }
 
        # proxy the PHP scripts to Apache listening on 127.0.0.1:80
        #
        #location ~ \.php$ {
        #    proxy_pass   http://127.0.0.1;
        #}
 
        # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
        #
        #location ~ \.php$ {
        #    root           html;
        #    fastcgi_pass   127.0.0.1:9000;
        #    fastcgi_index  index.php;
        #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
        #    include        fastcgi_params;
        #}
 
        # deny access to .htaccess files, if Apache's document root
        # concurs with nginx's one
        #
        #location ~ /\.ht {
        #    deny  all;
        #}
    }
 
 
    # another virtual host using mix of IP-, name-, and port-based configuration
    #
    #server {
    #    listen       8000;
    #    listen       somename:8080;
    #    server_name  somename  alias  another.alias;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
 
    # HTTPS server
    #
    #server {
    #    listen       443 ssl;
    #    server_name  localhost;
 
    #    ssl_certificate      cert.pem;
    #    ssl_certificate_key  cert.key;
 
    #    ssl_session_cache    shared:SSL:1m;
    #    ssl_session_timeout  5m;
 
    #    ssl_ciphers  HIGH:!aNULL:!MD5;
    #    ssl_prefer_server_ciphers  on;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
}
 
rtmp_auto_push on;
rtmp_auto_push_reconnect 1s;
rtmp_socket_dir /tmp;
 
rtmp {
    out_queue           4096;
    out_cork            8;
    max_streams         128;
    timeout             15s;
    drop_idle_publisher 15s;
 
    log_interval 5s; #log模块在access.log中记录日志的间隔时间,对调试非常有用
    log_size     1m; #log模块用来记录日志的缓冲区大小
 
    server {
        listen 1935;
        on_connect http://127.0.0.1:3000/on_connect;
 
        application live {
            live on;
            hls on;
            hls_path /usr/local/nginx/data/hls/live;
            dash on;
            dash_path /usr/local/nginx/data/dash/live;
            gop_cache on; #打开GOP缓存,减少首屏等待时间
 
            notify_update_timeout 30s;
            notify_relay_redirect off; # 为on_play和on_publish的远程重定向启用本地流重定向,新的流名称是用于远程重定向的RTMP URL的MD5哈希,默认为关闭。
            notify_update_strict off; # 切换on_update回调的严格模式,默认为关闭。打开后,所有连接错误、超时、HTTP解析错误和空响应均被视为更新失败并导致连接终止。
            notify_method get;
 
            on_play http://127.0.0.1:3000/on_play;
            on_publish http://127.0.0.1:3000/on_publish;
            on_done http://127.0.0.1:3000/on_done;
            on_play_done http://127.0.0.1:3000/on_play_done;
            on_publish_done http://127.0.0.1:3000/on_publish_done;
            on_record_done http://127.0.0.1:3000/on_record_done;
            on_update http://127.0.0.1:3000/on_update;
 
        }
    }
}

问题处理

启动报错
nginx: [warn] 4096 worker_connections exceed open file resource limit: 1024
 
# ulimit -n 65535
 
vim /etc/security/limits.conf
* soft nofile 65535
* hard nofile 65535
 
vim /etc/sysctl.conf 
fs.file-max = 6553560
未分类

DataNode

基于社区2.7.2分支源码分析

一、分析结果

可优化点及对应社区Patch

  1. DataNode启动时,初始化FsDatasetImpl锁粒度调整(HDFS-10682)
    • https://issues.apache.org/jira/browse/HDFS-10682
    • Replace FsDatasetImpl object lock with a separate lock object

      This Jira proposes to replace the FsDatasetImpl object lock with a separate lock object. Doing so will make it easier to measure lock statistics like lock held time and warn about potential lock contention due to slow disk operations.
      Right now we can use org.apache.hadoop.util.AutoCloseableLock. In the future we can also consider replacing the lock with a read-write lock.

  2. DataNode启动时,checkDiskError()检查并踢除坏掉的路径的地方可以并行检查并且不需要锁(HDFS-11086 )
    • https://issues.apache.org/jira/browse/HDFS-11086
    • DataNode disk check improvements

      This Jira tracks a few improvements to DataNode’s usage of DiskChecker to address the following problems:

      • Checks are serialized so a single slow disk can indefinitely delay checking the rest.
      • Related to 1, no detection of stalled checks.
      • Lack of granularity. A single IO error initiates checking all disks.
      • Inconsistent activation. Some DataNode IO failures trigger disk checks but not all.
  3. DataNode可优化内存占用来提高性能(HDFS-9260 HDFS-8859)
  4. 移除多余的Replica复制写操作
  5. DataNode ReplicaMap 锁使用(HDFS-10828 )
    • https://issues.apache.org/jira/browse/HDFS-10828
    • Fix usage of FsDatasetImpl object lock in ReplicaMap

      HDFS-10682 replaced the FsDatasetImpl object lock with a separate reentrant lock but missed updating an instance ReplicaMap still uses the FsDatasetImpl.

  6. DataNode启动时getAllVolumesMap可以用文件缓存初始化,需要在DataNode关闭时把VolumesMap序列化到文件(HDFS-7928 )
    • https://issues.apache.org/jira/browse/HDFS-7928
    • Scanning blocks from disk during rolling upgrade startup takes a lot of time if disks are busy

      We observed this issue in rolling upgrade to 2.6.x on one of our cluster.
      One of the disks was very busy and it took long time to scan that disk compared to other disks.
      Seeing the sar (System Activity Reporter) data we saw that the particular disk was very busy performing IO operations.
      Requesting for an improvement during datanode rolling upgrade.
      During shutdown, we can persist the whole volume map on the disk and let the datanode read that file and create the volume map during startup after rolling upgrade.
      This will not require the datanode process to scan all the disk and read the block.
      This will significantly improve the datanode startup time.

  7. 通过NameNode的heartbeat消息进行DataNodes的Full BLock Reports流控( HDFS-7923 )
    • https://issues.apache.org/jira/browse/HDFS-7923
    • The DataNodes should rate-limit their full block reports by asking the NN on heartbeat messages (cmccabe)

      The DataNodes should rate-limit their full block reports. They can do this by first sending a heartbeat message to the NN with an optional boolean set which requests permission to send a full block report. If the NN responds with another optional boolean set, the DN will send an FBR… if not, it will wait until later. This can be done compatibly with optional fields.

实现

HDFS-7923

DataNode在BPServiceActor.offerService中实现了sendHeartBeat和blockReport,并周期性执行offerService中的循环任务。当检测到fullBlockReportLeaseId == 0,并且已到下次report的时间(nextBlockReportTime - curTime <= 0)【注:第一次心跳时fullBlockReportLeaseId为0,之后每发送完report都会把它重置为0;nextBlockReportTime默认值为当前时间,每次执行完report后会把它设置为下次执行report的时间点】时,就会把申请block report lease的参数添加到心跳请求(HeartbeatRequestProto.requestFullBlockReportLease)中。随后NN(NameNodeRpcServer、BlockManager、BlockReportLeaseManager)会检查是否可以给该DN分配lease,如果可以,就在心跳返回消息中带上leaseId(HeartbeatResponseProto.fullBlockReportLeaseId)。DN获得lease后,就可以向NN发送block report了。

NN的 BlockReportLeaseManager 中维护了两个双向链表(deferredHead、pendingHead),pendingHead代表当前拥有report lease的DataNode节点。通过numPending与maxPending来控制DataNode block report lease的发放:当DataNode获得lease时,会从deferredHead移动到pendingHead链表;失去lease或执行完report时则向相反方向移动。lease的有效期由 dfs.namenode.full.block.report.lease.length.ms 控制,默认 5L * 60L * 1000L(5分钟);最多可以有 dfs.namenode.max.full.block.report.leases 个 DataNode 同时持有lease,默认 6 个。

NameNode兼容未使用流控的DataNode:DN按旧策略向NN发送心跳时,由于旧心跳不会设置requestFullBlockReportLease字段,NN得到的requestFullBlockReportLease值为默认值false;当该值为false时,NN不会向BlockManager申请report的leaseId。
DN根据下次执行时间与当前时间的比较来决定是否发送BR(下次执行时间第一次被设置为Scheduler对象的初始化时间,因此可以保证第一次心跳之后会执行一次BR);参数BlockReportContextProto.leaseId默认值为0,当leaseId为0时,NN不检查lease。
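
下面是一段极简的示意代码(非Hadoop源码,FbrScheduler、NameNodeStub等名字均为本文虚构),用来说明DN侧“先在心跳中申请lease、拿到leaseId后再发送FBR、发送完重置并安排下次汇报时间”的控制流程:

// 简化示意(非Hadoop实现):DN根据leaseId与下次汇报时间决定是否申请lease并发送FBR
class FbrScheduler {
    /** 本示例假设的NN交互接口 */
    interface NameNodeStub {
        /** 心跳:requestFullBlockReportLease为true时,NN可在响应中返回非0的leaseId */
        long heartbeat(boolean requestFullBlockReportLease);
        /** 全量块汇报,携带leaseId(为0时NN不校验lease,兼容旧DN) */
        void blockReport(long leaseId);
    }

    private long fullBlockReportLeaseId = 0L;      // 初始为0,每次汇报完成后重置为0
    private long nextBlockReportTime = now();      // 默认为当前时间,保证首次心跳后就会触发一次汇报
    private final long blockReportIntervalMs = 6L * 60 * 60 * 1000;

    private static long now() { return System.currentTimeMillis(); }

    void offerServiceOnce(NameNodeStub nn) {
        // 没有lease且已到汇报时间,才在心跳中申请lease
        boolean request = fullBlockReportLeaseId == 0 && nextBlockReportTime - now() <= 0;
        long leaseId = nn.heartbeat(request);
        if (request && leaseId != 0) {
            fullBlockReportLeaseId = leaseId;
        }
        // 拿到lease后才发送FBR;发送完重置lease并安排下次汇报时间
        if (fullBlockReportLeaseId != 0 && nextBlockReportTime - now() <= 0) {
            nn.blockReport(fullBlockReportLeaseId);
            fullBlockReportLeaseId = 0;
            nextBlockReportTime = now() + blockReportIntervalMs;
        }
    }
}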

升级步骤:
先升级NN,然后再滚动升级DN

执行的流程图:hdfs-7923(图略)。

# 新增加两个配置
<property>
  <name>dfs.namenode.max.full.block.report.leases</name>
  <value>6</value>
  <description>The maximum number of leases for full block reports that the
    NameNode will issue at any given time.  This prevents the NameNode from
    being flooded with full block reports that use up all the RPC handler
    threads.  This number should never be more than the number of RPC handler
    threads or less than 1.
  </description>
</property>
 
<property>
  <name>dfs.namenode.full.block.report.lease.length.ms</name>
  <value>300000</value>
  <description>
    The number of milliseconds that the NameNode will wait before invalidating
    a full block report lease.  This prevents a crashed DataNode from
    permanently using up a full block report lease.
  </description>
</property>

二、代码走查

启动工作线程

1. 通信:BPServiceActor,IpcServer,DataXceiverServer,localDataXceiverServer
2. 监控:JVMPauseMonitor,DU(dfsUsage)
3. 其他:InfoServer

入口

# org.apache.hadoop.hdfs.server.datanode.DataNode
public static void main(String args[]) {
	if (DFSUtil.parseHelpArgument(args, DataNode.USAGE, System.out, true)) {
	  System.exit(0);
	}
 
	secureMain(args, null);
}
 
public static void secureMain(String args[], SecureResources resources) {
	int errorCode = 0;
	try {
	  StringUtils.startupShutdownMessage(DataNode.class, args, LOG);
	  // 创建DataNode
	  DataNode datanode = createDataNode(args, null, resources);
	  if (datanode != null) {
	    // 注:只需要等待BPServiceActor线程结束
	    datanode.join();
	  } else {
	    errorCode = 1;
	  }
	} catch (Throwable e) {
	  LOG.fatal("Exception in secureMain", e);
	  terminate(1, e);
	} finally {
	  // We need to terminate the process here because either shutdown was called
	  // or some disk related conditions like volumes tolerated or volumes required
	  // condition was not met. Also, In secure mode, control will go to Jsvc
	  // and Datanode process hangs if it does not exit.
	  LOG.warn("Exiting Datanode");
	  terminate(errorCode);
	}
}
 
  public static DataNode createDataNode(String args[], Configuration conf,
      SecureResources resources) throws IOException {
    // 初始化DataNode,并完成部分工作线程的启动
    DataNode dn = instantiateDataNode(args, conf, resources);
    if (dn != null) {
      // 启动剩余工作线程,dataXceiverServer,localDataXceiverServer,ipcServer等
      dn.runDatanodeDaemon();
    }
    return dn;
  }
 
   public static DataNode instantiateDataNode(String args [], Configuration conf,
      SecureResources resources) throws IOException {
    if (conf == null)
      conf = new HdfsConfiguration();
 
    if (args != null) {
      // parse generic hadoop options
      GenericOptionsParser hParser = new GenericOptionsParser(conf, args);
      args = hParser.getRemainingArgs();
    }
 
    if (!parseArguments(args, conf)) {
      printUsage(System.err);
      return null;
    }
    // dfs.datanode.data.dir 可以[,]号分隔,如:[Disk]/storages/storage1/
    Collection<StorageLocation> dataLocations = getStorageLocations(conf);
    UserGroupInformation.setConfiguration(conf);
    SecurityUtil.login(conf, DFS_DATANODE_KEYTAB_FILE_KEY,
        DFS_DATANODE_KERBEROS_PRINCIPAL_KEY);
    return makeInstance(dataLocations, conf, resources);
  }
 
  /**
   * 确认所给的数据目录至少有一个可以创建(如果父目录不存在也要可以创建)然后实例化DataNode
   */
  static DataNode makeInstance(Collection<StorageLocation> dataDirs,
      Configuration conf, SecureResources resources) throws IOException {
    LocalFileSystem localFS = FileSystem.getLocal(conf);
    FsPermission permission = new FsPermission(
        conf.get(DFS_DATANODE_DATA_DIR_PERMISSION_KEY,
                 DFS_DATANODE_DATA_DIR_PERMISSION_DEFAULT));
    // 磁盘检查类,需要时创建本地目录,检查权限并且保证可以对目录进行读写
    DataNodeDiskChecker dataNodeDiskChecker =
        new DataNodeDiskChecker(permission);
    // 过滤出具有rwx权限的目录。TODO: 不具备rwx权限的目录需要进一步检查
    List<StorageLocation> locations =
        checkStorageLocations(dataDirs, localFS, dataNodeDiskChecker);
    DefaultMetricsSystem.initialize("DataNode");
 
    assert locations.size() > 0 : "number of data directories should be > 0";
    return new DataNode(conf, locations, resources);
  }
 
  /**
   * Create the DataNode given a configuration, an array of dataDirs,
   * and a namenode proxy
   */
  DataNode(final Configuration conf,
           final List<StorageLocation> dataDirs,
           final SecureResources resources) throws IOException {
    ......
    try {
      hostName = getHostName(conf);
      LOG.info("Configured hostname is " + hostName);
      startDataNode(conf, dataDirs, resources);
    } catch (IOException ie) {
      shutdown();
      throw ie;
    }
    ......
  }
 
  void startDataNode(Configuration conf,
                     List<StorageLocation> dataDirs,
                     SecureResources resources
                     ) throws IOException {
 
    // settings global for all BPs in the Data Node
    this.secureResources = resources;
    synchronized (this) {
      this.dataDirs = dataDirs;
    }
    this.conf = conf;
    this.dnConf = new DNConf(conf);
    checkSecureConfig(dnConf, conf, resources);
 
    this.spanReceiverHost =
      SpanReceiverHost.get(conf, DFSConfigKeys.DFS_SERVER_HTRACE_PREFIX);
 
    if (dnConf.maxLockedMemory > 0) {
      if (!NativeIO.POSIX.getCacheManipulator().verifyCanMlock()) {
        throw new RuntimeException(String.format(
            "Cannot start datanode because the configured max locked memory" +
            " size (%s) is greater than zero and native code is not available.",
            DFS_DATANODE_MAX_LOCKED_MEMORY_KEY));
      }
      if (Path.WINDOWS) {
        NativeIO.Windows.extendWorkingSetSize(dnConf.maxLockedMemory);
      } else {
        long ulimit = NativeIO.POSIX.getCacheManipulator().getMemlockLimit();
        if (dnConf.maxLockedMemory > ulimit) {
          throw new RuntimeException(String.format(
            "Cannot start datanode because the configured max locked memory" +
            " size (%s) of %d bytes is more than the datanode's available" +
            " RLIMIT_MEMLOCK ulimit of %d bytes.",
            DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
            dnConf.maxLockedMemory,
            ulimit));
        }
      }
    }
    LOG.info("Starting DataNode with maxLockedMemory = " +
        dnConf.maxLockedMemory);
 
    // 初始化DataStorage
    storage = new DataStorage();
 
    // global DN settings
    // 注册JMX
    registerMXBean();
    // https://blog.csdn.net/lipeng_bigdata/article/details/50828066
    // 初始化:DataXceiverServer,这个服务是用来接收、发送数据块,它从客户端或其它DataNode接收请求,流式通信,在DataNode#runDatanodeDaemon()中启动
    initDataXceiver(conf);
    // 启动InfoServer(WEB UI)
    startInfoServer(conf);
    // 启动JvmPauseMonitor,反射监控JVM,可以通过JMS查询
    pauseMonitor = new JvmPauseMonitor(conf);
    pauseMonitor.start();
 
    // BlockPoolTokenSecretManager is required to create ipc server.
    this.blockPoolTokenSecretManager = new BlockPoolTokenSecretManager();
 
    // Login is done by now. Set the DN user name.
    dnUserName = UserGroupInformation.getCurrentUser().getShortUserName();
    LOG.info("dnUserName = " + dnUserName);
    LOG.info("supergroup = " + supergroup);
    // 初始化IPC服务,在DataNode#runDatanodeDaemon()中启动
    initIpcServer(conf);
 
    metrics = DataNodeMetrics.create(conf, getDisplayName());
    metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
 
    // 初始化BlockPoolManager
    blockPoolManager = new BlockPoolManager(this);
    blockPoolManager.refreshNamenodes(conf);
 
    // Create the ReadaheadPool from the DataNode context so we can
    // exit without having to explicitly shutdown its thread pool.
    readaheadPool = ReadaheadPool.getInstance();
    saslClient = new SaslDataTransferClient(dnConf.conf,
        dnConf.saslPropsResolver, dnConf.trustedChannelResolver);
    saslServer = new SaslDataTransferServer(dnConf, blockPoolTokenSecretManager);
  }

BlockPoolManager & BPOfferService

1. BlockPoolManager: 管理DataNode上所有的块池,对象的创建、删除、启动、停止、关闭必须通过这个类的API完成。每一个DataNode都有一个BlockPoolManager的实例。
2. BPOfferService: 在DataNode上每个块池/命名空间对应一个实例,负责处理该命名空间Active和Standby NN的心跳。此类为每个NN管理一个 BPServiceActor 实例,并将调用委托给这两个NN,同时维护关于哪个NN被视为Active的状态。

private void doRefreshNamenodes(
      Map<String, Map<String, InetSocketAddress>> addrMap) throws IOException {
    assert Thread.holdsLock(refreshNamenodesLock);
 
    Set<String> toRefresh = Sets.newLinkedHashSet();
    Set<String> toAdd = Sets.newLinkedHashSet();
    Set<String> toRemove;
 
    synchronized (this) {
      // Step 1. For each of the new nameservices, figure out whether
      // it's an update of the set of NNs for an existing NS,
      // or an entirely new nameservice.
      // 启动时bpByNameserviceId是空集合
      for (String nameserviceId : addrMap.keySet()) {
        if (bpByNameserviceId.containsKey(nameserviceId)) {
          toRefresh.add(nameserviceId);
        } else {
          toAdd.add(nameserviceId);
        }
      }
 
      // Step 2. Any nameservices we currently have but are no longer present
      // need to be removed.
      toRemove = Sets.newHashSet(Sets.difference(
          bpByNameserviceId.keySet(), addrMap.keySet()));
 
      assert toRefresh.size() + toAdd.size() ==
        addrMap.size() :
          "toAdd: " + Joiner.on(",").useForNull("<default>").join(toAdd) +
          "  toRemove: " + Joiner.on(",").useForNull("<default>").join(toRemove) +
          "  toRefresh: " + Joiner.on(",").useForNull("<default>").join(toRefresh);
 
 
      // Step 3. Start new nameservices
      if (!toAdd.isEmpty()) {
        LOG.info("Starting BPOfferServices for nameservices: " +
            Joiner.on(",").useForNull("<default>").join(toAdd));
 
        for (String nsToAdd : toAdd) {
          ArrayList<InetSocketAddress> addrs =
            Lists.newArrayList(addrMap.get(nsToAdd).values());
          // 创建BPOfferService
          BPOfferService bpos = createBPOS(addrs);
          // 加入bpByNameserviceId集合,下次就会出现在toRefresh集合
          bpByNameserviceId.put(nsToAdd, bpos);
          offerServices.add(bpos);
        }
      }
      // 启动所有的BPOfferService,实际是启动下面的BPServiceActor
      startAll();
    }
 
    // Step 4. Shut down old nameservices. This happens outside
    // of the synchronized(this) lock since they need to call
    // back to .remove() from another thread
    if (!toRemove.isEmpty()) {
      LOG.info("Stopping BPOfferServices for nameservices: " +
          Joiner.on(",").useForNull("<default>").join(toRemove));
 
      for (String nsToRemove : toRemove) {
        BPOfferService bpos = bpByNameserviceId.get(nsToRemove);
        bpos.stop();
        bpos.join();
        // they will call remove on their own
      }
    }
 
    // Step 5. Update nameservices whose NN list has changed
    if (!toRefresh.isEmpty()) {
      LOG.info("Refreshing list of NNs for nameservices: " +
          Joiner.on(",").useForNull("<default>").join(toRefresh));
 
      for (String nsToRefresh : toRefresh) {
        BPOfferService bpos = bpByNameserviceId.get(nsToRefresh);
        ArrayList<InetSocketAddress> addrs =
          Lists.newArrayList(addrMap.get(nsToRefresh).values());
        bpos.refreshNNList(addrs);
      }
    }
  }
 
  BPOfferService(List<InetSocketAddress> nnAddrs, DataNode dn) {
    Preconditions.checkArgument(!nnAddrs.isEmpty(),
        "Must pass at least one NN.");
    this.dn = dn;
 
    for (InetSocketAddress addr : nnAddrs) {
      this.bpServices.add(new BPServiceActor(addr, this));
    }
  }

BPServiceActor

startAll(); # 启动所有的BPOfferService,BPOfferService又会启动BPServiceActor的bpThread线程,bpThread线程的run执行逻辑是:

1. 与NameNode进行第一次握手,获取命名空间的信息
2. 向NameNode注册当前DataNode
3. 周期性发送心跳到namenode,增量块汇报,全量块汇报,缓存块汇报等
4. 处理来自namenode的命令

  # BPServiceActor.run()
  public void run() {
    LOG.info(this + " starting to offer service");
 
    try {
      while (true) {
        // init stuff
        try {
          // setup storage
          // 和namenode注册
          // TODO: 优化
          connectToNNAndHandshake();
          break;
        } catch (IOException ioe) {
          // Initial handshake, storage recovery or registration failed
          runningState = RunningState.INIT_FAILED;
          if (shouldRetryInit()) {
            // Retry until all namenode's of BPOS failed initialization
            LOG.error("Initialization failed for " + this + " "
                + ioe.getLocalizedMessage());
            sleepAndLogInterrupts(5000, "initializing");
          } else {
            runningState = RunningState.FAILED;
            LOG.fatal("Initialization failed for " + this + ". Exiting. ", ioe);
            return;
          }
        }
      }
 
      runningState = RunningState.RUNNING;
 
      while (shouldRun()) {
        try {
          offerService();
        } catch (Exception ex) {
          LOG.error("Exception in BPOfferService for " + this, ex);
          sleepAndLogInterrupts(5000, "offering service");
        }
      }
      runningState = RunningState.EXITED;
    } catch (Throwable ex) {
      LOG.warn("Unexpected exception in block pool " + this, ex);
      runningState = RunningState.FAILED;
    } finally {
      LOG.warn("Ending block pool service for: " + this);
      cleanUp();
    }
  }
 
  private void connectToNNAndHandshake() throws IOException {
    // get NN proxy
    bpNamenode = dn.connectToNN(nnAddr);
 
    // First phase of the handshake with NN - get the namespace
    // info.
    NamespaceInfo nsInfo = retrieveNamespaceInfo();
 
    // Verify that this matches the other NN in this HA pair.
    // This also initializes our block pool in the DN if we are
    // the first NN connection for this BP.
    bpos.verifyAndSetNamespaceInfo(nsInfo);
 
    // Second phase of the handshake with the NN.
    register(nsInfo);
  }
 
  # BPOfferService.verifyAndSetNamespaceInfo(NamespaceInfo nsInfo)
  /**
   * Called by the BPServiceActors when they handshake to a NN.
   * If this is the first NN connection, this sets the namespace info
   * for this BPOfferService. If it's a connection to a new NN, it
   * verifies that this namespace matches (eg to prevent a misconfiguration
   * where a StandbyNode from a different cluster is specified)
   */
  void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException {
    writeLock();
    try {
      if (this.bpNSInfo == null) {
        this.bpNSInfo = nsInfo;
        boolean success = false;
 
        // Now that we know the namespace ID, etc, we can pass this to the DN.
        // The DN can now initialize its local storage if we are the
        // first BP to handshake, etc.
        try {
          dn.initBlockPool(this);
          success = true;
        } finally {
          if (!success) {
            // The datanode failed to initialize the BP. We need to reset
            // the namespace info so that other BPService actors still have
            // a chance to set it, and re-initialize the datanode.
            this.bpNSInfo = null;
          }
        }
      } else {
        checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(),
            "Blockpool ID");
        checkNSEquality(bpNSInfo.getNamespaceID(), nsInfo.getNamespaceID(),
            "Namespace ID");
        checkNSEquality(bpNSInfo.getClusterID(), nsInfo.getClusterID(),
            "Cluster ID");
      }
    } finally {
      writeUnlock();
    }
  }

dn.initBlockPool(this);# DataNode.initBlockPool

# DataNode.initBlockPool(BPOfferService bpos)
/**
   * One of the Block Pools has successfully connected to its NN.
   * This initializes the local storage for that block pool,
   * checks consistency of the NN's cluster ID, etc.
   *
   * If this is the first block pool to register, this also initializes
   * the datanode-scoped storage.
   *
   * @param bpos Block pool offer service
   * @throws IOException if the NN is inconsistent with the local storage.
   */
  void initBlockPool(BPOfferService bpos) throws IOException {
    NamespaceInfo nsInfo = bpos.getNamespaceInfo();
    if (nsInfo == null) {
      throw new IOException("NamespaceInfo not found: Block pool " + bpos
          + " should have retrieved namespace info before initBlockPool.");
    }
 
    setClusterId(nsInfo.clusterID, nsInfo.getBlockPoolID());
 
    // Register the new block pool with the BP manager.
    blockPoolManager.addBlockPool(bpos);
 
    // In the case that this is the first block pool to connect, initialize
    // the dataset, block scanners, etc.
    initStorage(nsInfo);
 
    // Exclude failed disks before initializing the block pools to avoid startup
    // failures.
    checkDiskError();
 
    data.addBlockPool(nsInfo.getBlockPoolID(), conf);
    blockScanner.enableBlockPoolId(bpos.getBlockPoolId());
    initDirectoryScanner(conf);
  }

initStorage(nsInfo);

  # DataNode.initStorage(final NamespaceInfo nsInfo)
  // 8c0638471f8f1dd47667b2d6727d4d2d54e4b48c Tue Aug 09 03:02:53 CST 2016    Arpit Agarwal   HADOOP-10682. Replace FsDatasetImpl object lock with a separate lock object. (Chen Liang)
  // TODO: 查看initStorage(nsInfo)方法的实现
  /**
   * Initializes the {@link #data}. The initialization is done only once, when
   * handshake with the the first namenode is completed.
   */
  private void initStorage(final NamespaceInfo nsInfo) throws IOException {
    final FsDatasetSpi.Factory<? extends FsDatasetSpi<?>> factory
        = FsDatasetSpi.Factory.getFactory(conf);
 
    if (!factory.isSimulated()) {
      final StartupOption startOpt = getStartupOption(conf);
      if (startOpt == null) {
        throw new IOException("Startup option not set.");
      }
      final String bpid = nsInfo.getBlockPoolID();
      //read storage info, lock data dirs and transition fs state if necessary
      synchronized (this) {
        storage.recoverTransitionRead(this, nsInfo, dataDirs, startOpt);
      }
      final StorageInfo bpStorage = storage.getBPStorage(bpid);
      LOG.info("Setting up storage: nsid=" + bpStorage.getNamespaceID()
          + ";bpid=" + bpid + ";lv=" + storage.getLayoutVersion()
          + ";nsInfo=" + nsInfo + ";dnuuid=" + storage.getDatanodeUuid());
    }
 
    // If this is a newly formatted DataNode then assign a new DatanodeUuid.
    checkDatanodeUuid();
 
    synchronized(this)  {
      if (data == null) {
        data = factory.newInstance(this, storage, conf);
      }
    }
  }
 
  // factory使用的初始化语句为
  /**
 * A factory for creating {@link FsDatasetImpl} objects.
 */
public class FsDatasetFactory extends FsDatasetSpi.Factory<FsDatasetImpl> {
  @Override
  public FsDatasetImpl newInstance(DataNode datanode,
      DataStorage storage, Configuration conf) throws IOException {
    return new FsDatasetImpl(datanode, storage, conf);
  }
}
 
// 查看FsDatasetImpl的构造函数
/**
   * An FSDataset has a directory where it loads its data files.
   */
  FsDatasetImpl(DataNode datanode, DataStorage storage, Configuration conf
      ) throws IOException {
    this.fsRunning = true;
    this.datanode = datanode;
    this.dataStorage = storage;
    this.conf = conf;
    // The number of volumes required for operation is the total number
    // of volumes minus the number of failed volumes we can tolerate.
    final int volFailuresTolerated =
      conf.getInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY,
                  DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_DEFAULT);
 
    String[] dataDirs = conf.getTrimmedStrings(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY);
    Collection<StorageLocation> dataLocations = DataNode.getStorageLocations(conf);
    List<VolumeFailureInfo> volumeFailureInfos = getInitialVolumeFailureInfos(
        dataLocations, storage);
 
    int volsConfigured = (dataDirs == null) ? 0 : dataDirs.length;
    int volsFailed = volumeFailureInfos.size();
    this.validVolsRequired = volsConfigured - volFailuresTolerated;
 
    if (volFailuresTolerated < 0 || volFailuresTolerated >= volsConfigured) {
      throw new DiskErrorException("Invalid volume failure "
          + " config value: " + volFailuresTolerated);
    }
    if (volsFailed > volFailuresTolerated) {
      throw new DiskErrorException("Too many failed volumes - "
          + "current valid volumes: " + storage.getNumStorageDirs()
          + ", volumes configured: " + volsConfigured
          + ", volumes failed: " + volsFailed
          + ", volume failures tolerated: " + volFailuresTolerated);
    }
 
    storageMap = new ConcurrentHashMap<String, DatanodeStorage>();
    volumeMap = new ReplicaMap(this);
    ramDiskReplicaTracker = RamDiskReplicaTracker.getInstance(conf, this);
 
    @SuppressWarnings("unchecked")
    final VolumeChoosingPolicy<FsVolumeImpl> blockChooserImpl =
        ReflectionUtils.newInstance(conf.getClass(
            DFSConfigKeys.DFS_DATANODE_FSDATASET_VOLUME_CHOOSING_POLICY_KEY,
            RoundRobinVolumeChoosingPolicy.class,
            VolumeChoosingPolicy.class), conf);
    // volumes对象生成
    volumes = new FsVolumeList(volumeFailureInfos, datanode.getBlockScanner(),
        blockChooserImpl);
    asyncDiskService = new FsDatasetAsyncDiskService(datanode, this);
    asyncLazyPersistService = new RamDiskAsyncLazyPersistService(datanode);
    deletingBlock = new HashMap<String, Set<Long>>();
    // 添加volume到volumes对象
    for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
      addVolume(dataLocations, storage.getStorageDir(idx));
    }
    setupAsyncLazyPersistThreads();
 
    cacheManager = new FsDatasetCache(this);
 
    // Start the lazy writer once we have built the replica maps.
    lazyWriter = new Daemon(new LazyWriter(conf));
    lazyWriter.start();
    registerMBean(datanode.getDatanodeUuid());
    localFS = FileSystem.getLocal(conf);
    blockPinningEnabled = conf.getBoolean(
      DFSConfigKeys.DFS_DATANODE_BLOCK_PINNING_ENABLED,
      DFSConfigKeys.DFS_DATANODE_BLOCK_PINNING_ENABLED_DEFAULT);
  }
  // addVolume的实现
  private void addVolume(Collection<StorageLocation> dataLocations,
      Storage.StorageDirectory sd) throws IOException {
    final File dir = sd.getCurrentDir();
    final StorageType storageType =
        getStorageTypeFromLocations(dataLocations, sd.getRoot());
 
    // If IOException raises from FsVolumeImpl() or getVolumeMap(), there is
    // nothing needed to be rolled back to make various data structures, e.g.,
    // storageMap and asyncDiskService, consistent.
    FsVolumeImpl fsVolume = new FsVolumeImpl(
        this, sd.getStorageUuid(), dir, this.conf, storageType);
    FsVolumeReference ref = fsVolume.obtainReference();
    ReplicaMap tempVolumeMap = new ReplicaMap(this);
    fsVolume.getVolumeMap(tempVolumeMap, ramDiskReplicaTracker);
 
    // TODO: 锁优化,拆分锁,调整锁定范围
    synchronized (this) {
      volumeMap.addAll(tempVolumeMap);
      storageMap.put(sd.getStorageUuid(),
          new DatanodeStorage(sd.getStorageUuid(),
              DatanodeStorage.State.NORMAL,
              storageType));
      asyncDiskService.addVolume(sd.getCurrentDir());
      volumes.addVolume(ref);
    }
 
    LOG.info("Added volume - " + dir + ", StorageType: " + storageType);
  }

checkDiskError(); 检查并移除坏掉的磁盘(volume)

# DataNode.checkDiskError()
// TODO: 优化点,可并行检查,并且不需要锁定
// f678080dbd25a218e0406463a3c3a1fc03680702	Wed Dec 21 05:53:32 CST 2016	Xiaoyu Yao	HDFS-11182. Update DataNode to use DatasetVolumeChecker. Contributed by Arpit Agarwal.
// eaaa32950cbae42a74e28e3db3f0cdb1ff158119	Wed Nov 30 12:31:02 CST 2016	Arpit Agarwal	HDFS-11149. Support for parallel checking of FsVolumes.
/**
   * Check the disk error
   */
  private void checkDiskError() {
    Set<File> unhealthyDataDirs = data.checkDataDir();
    if (unhealthyDataDirs != null && !unhealthyDataDirs.isEmpty()) {
      try {
        // Remove all unhealthy volumes from DataNode.
        removeVolumes(unhealthyDataDirs, false);
      } catch (IOException e) {
        LOG.warn("Error occurred when removing unhealthy storage dirs: "
            + e.getMessage(), e);
      }
      StringBuilder sb = new StringBuilder("DataNode failed volumes:");
      for (File dataDir : unhealthyDataDirs) {
        sb.append(dataDir.getAbsolutePath() + ";");
      }
      handleDiskError(sb.toString());
    }
  }
 
  # FsDatasetImpl.checkDirs()
  Set<File> checkDirs() {
    synchronized(checkDirsMutex) {
      Set<File> failedVols = null;
 
      // Make a copy of volumes for performing modification
      final List<FsVolumeImpl> volumeList = getVolumes();
 
      for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
        final FsVolumeImpl fsv = i.next();
        try (FsVolumeReference ref = fsv.obtainReference()) {
          fsv.checkDirs();
        } catch (DiskErrorException e) {
          FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ", e);
          if (failedVols == null) {
            failedVols = new HashSet<>(1);
          }
          failedVols.add(new File(fsv.getBasePath()).getAbsoluteFile());
          addVolumeFailureInfo(fsv);
          removeVolume(fsv);
        } catch (ClosedChannelException e) {
          FsDatasetImpl.LOG.debug("Caught exception when obtaining " +
            "reference count on closed volume", e);
        } catch (IOException e) {
          FsDatasetImpl.LOG.error("Unexpected IOException", e);
        }
      }
 
      if (failedVols != null && failedVols.size() > 0) {
        FsDatasetImpl.LOG.warn("Completed checkDirs. Found " + failedVols.size()
            + " failure volumes.");
      }
 
      return failedVols;
    }
  }

data.addBlockPool(nsInfo.getBlockPoolID(), conf);

@Override
  public void addBlockPool(String bpid, Configuration conf)
      throws IOException {
    LOG.info("Adding block pool " + bpid);
    synchronized(this) {
      volumes.addBlockPool(bpid, conf);
      volumeMap.initBlockPool(bpid);
    }
    volumes.getAllVolumesMap(bpid, volumeMap, ramDiskReplicaTracker);
  }

BlockPoolSlice

关于du的作用及优化:

在linux系统上,该线程将定期通过du -sk命令统计各blockpool目录的占用情况,随着心跳汇报给namenode。

执行linux命令需要从JVM fork出子进程,成本较高(尽管linux使用COW策略避免了对内存空间的完全拷贝)。为了加快DataNode启动速度,此处允许使用之前缓存的dfsUsage值,该值保存在current目录下的dfsUsed文件中;缓存的dfsUsage会定期(fs.du.interval,默认600秒)持久化到磁盘;在JVM正常关闭时,也会将当前的dfsUsage值持久化。
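
下面用一小段示意代码(非Hadoop源码,CachedDfsUsed、duScan等名字为本文虚构;dfsUsed文件名和600秒的刷新间隔取自上文)说明“优先读缓存文件、缓存缺失或过期时才真正扫描”的思路:

// 简化示意(非Hadoop实现):优先使用current/dfsUsed中缓存的占用量,缓存缺失或过期时才扫描
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

class CachedDfsUsed {
    // 缓存有效期,对应 fs.du.interval,默认600秒
    static final long DU_INTERVAL_MS = 600_000L;

    /** 本示例假设缓存文件格式为:"<usedBytes> <写入时间戳毫秒>" */
    static long loadDfsUsed(Path currentDir) throws IOException {
        Path cache = currentDir.resolve("dfsUsed");
        if (Files.exists(cache)) {
            String[] parts = new String(Files.readAllBytes(cache), StandardCharsets.UTF_8)
                    .trim().split("\\s+");
            long used = Long.parseLong(parts[0]);
            long savedAt = Long.parseLong(parts[1]);
            if (System.currentTimeMillis() - savedAt < DU_INTERVAL_MS) {
                return used;               // 缓存仍然有效,省掉一次du扫描
            }
        }
        return duScan(currentDir);         // 缓存缺失或过期,退回到真正的扫描
    }

    /** 关闭或定期刷新时持久化当前值,供下次启动使用 */
    static void saveDfsUsed(Path currentDir, long usedBytes) throws IOException {
        String line = usedBytes + " " + System.currentTimeMillis();
        Files.write(currentDir.resolve("dfsUsed"), line.getBytes(StandardCharsets.UTF_8));
    }

    /** 示意:真实实现是du -sk或遍历目录统计,这里仅返回0占位 */
    static long duScan(Path dir) { return 0L; }
}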

void addBlockPool(final String bpid, final Configuration conf) throws IOException {
    long totalStartTime = Time.monotonicNow();
 
    final List<IOException> exceptions = Collections.synchronizedList(
        new ArrayList<IOException>());
    List<Thread> blockPoolAddingThreads = new ArrayList<Thread>();
    for (final FsVolumeImpl v : volumes.get()) {
      Thread t = new Thread() {
        public void run() {
          try (FsVolumeReference ref = v.obtainReference()) {
            FsDatasetImpl.LOG.info("Scanning block pool " + bpid +
                " on volume " + v + "...");
            long startTime = Time.monotonicNow();
            // 关注
            v.addBlockPool(bpid, conf);
            long timeTaken = Time.monotonicNow() - startTime;
            FsDatasetImpl.LOG.info("Time taken to scan block pool " + bpid +
                " on " + v + ": " + timeTaken + "ms");
          } catch (ClosedChannelException e) {
            // ignore.
          } catch (IOException ioe) {
            FsDatasetImpl.LOG.info("Caught exception while scanning " + v +
                ". Will throw later.", ioe);
            exceptions.add(ioe);
          }
        }
      };
      blockPoolAddingThreads.add(t);
      t.start();
    }
    for (Thread t : blockPoolAddingThreads) {
      try {
        t.join();
      } catch (InterruptedException ie) {
        throw new IOException(ie);
      }
    }
    if (!exceptions.isEmpty()) {
      throw exceptions.get(0);
    }
 
    long totalTimeTaken = Time.monotonicNow() - totalStartTime;
    FsDatasetImpl.LOG.info("Total time to scan all replicas for block pool " +
        bpid + ": " + totalTimeTaken + "ms");
  }
 
  void addBlockPool(String bpid, Configuration conf) throws IOException {
    File bpdir = new File(currentDir, bpid);
    BlockPoolSlice bp = new BlockPoolSlice(bpid, this, bpdir, conf);
    bpSlices.put(bpid, bp);
  }

volumeMap.initBlockPool(bpid);

优化性能和修复锁使用的Bug

// 8ae4729107d33c6001cf1fdc8837afb71ea6c0d3	Wed Sep 28 01:02:15 CST 2016	Arpit Agarwal	HDFS-10828. Fix usage of FsDatasetImpl object lock in ReplicaMap. (Arpit Agarwal) HDFS-10682 replaced the FsDatasetImpl object lock with a separate reentrant lock but missed updating an instance ReplicaMap still uses the FsDatasetImpl.
// dd9ebf6eedfd4ff8b3486eae2a446de6b0c7fa8a	Wed Feb 03 03:23:00 CST 2016	Colin Patrick Mccabe	HDFS-9260. Improve the performance and GC friendliness of NameNode startup and full block reports (Staffan Friberg via cmccabe)
// d6fa34e014b0e2a61b24f05dd08ebe12354267fd	Tue Sep 29 16:20:35 CST 2015	yliu	HDFS-8859. Improve DataNode ReplicaMap memory footprint to save about 45%. (yliu)
 
# ReplicaMap.java
void initBlockPool(String bpid) {
    checkBlockPool(bpid);
    synchronized(mutex) {
      Map<Long, ReplicaInfo> m = map.get(bpid);
      if (m == null) {
        // Add an entry for block pool if it does not exist already
        m = new HashMap<Long, ReplicaInfo>();
        map.put(bpid, m);
      }
    }
  }
  // FsDatasetImpl
  static ReplicaRecoveryInfo initReplicaRecovery(String bpid, ReplicaMap map,
      Block block, long recoveryId, long xceiverStopTimeout) throws IOException

volumes.getAllVolumesMap(bpid, volumeMap, ramDiskReplicaTracker);
增加缓存

// fc1031af749435dc95efea6745b1b2300ce29446	Thu Mar 26 03:42:59 CST 2015	Kihwal Lee	HDFS-7928. Scanning blocks from disk during rolling upgrade startup takes a lot of time if disks are busy. Contributed by Rushabh Shah.
void getVolumeMap(ReplicaMap volumeMap,
                    final RamDiskReplicaTracker lazyWriteReplicaMap)
      throws IOException {
    // Recover lazy persist replicas, they will be added to the volumeMap
    // when we scan the finalized directory.
    if (lazypersistDir.exists()) {
      int numRecovered = moveLazyPersistReplicasToFinalized(lazypersistDir);
      FsDatasetImpl.LOG.info(
          "Recovered " + numRecovered + " replicas from " + lazypersistDir);
    }
 
    // add finalized replicas
    addToReplicasMap(volumeMap, finalizedDir, lazyWriteReplicaMap, true);
    // add rbw replicas
    addToReplicasMap(volumeMap, rbwDir, lazyWriteReplicaMap, false);
  }
未分类

qcow2镜像定制指南

背景

目前网络上关于定制镜像的说明很分散,需要搜寻很多文章才能完成镜像的定制任务。所以我尝试提供一个全面而系统的指南,遵循本指南,用户可以方便地完成镜像的定制。

实施步骤

一、环境配置

1、准备软件
Mac Pro、VMware Fusion、CentOS-7-x86_64-DVD-1708.iso、CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2
2、安装嵌套CentOS环境
由于macOS不支持KVM,故需要在嵌套的操作系统中安装定制云镜像所需的软件;使用Fusion很容易在macOS中虚拟出一个CentOS环境。
3、修改嵌套操作系统配置
在CentOS关闭的情况下打开虚拟机“处理器和内存”配置,选择“高级配置”,选中“在此虚拟机中启用虚拟化管理程序”和“在此虚拟机中启用代码分析应用程序”。如果缺少这一步,启动virt-manager时会报“WARNING: KVM不可用。这可能是因为没有安装KVM软件包,或者没有载入KVM内核模块,您的虚拟机性能可能很差”的错误。完成设置后启动虚拟机。以下操作如无特殊说明都是在嵌套操作系统中执行。
4、安装依赖

yum install qemu-kvm qemu-img qemu-kvm-tools qemu-kvm-common
yum install libvirt-admin libvirt-client libvirt-daemon libvirt-devel libvirt
yum install libguestfs libguestfs-tools libguestfs-bash-completion

5、编译nbd内核模块(如不使用“nbd挂载方式修改镜像”则不需要安装此模块)
执行以下命令,如出现下面的报错,说明系统中没有nbd模块,需要自己手动编译安装

modprobe nbd
modprobe: FATAL: Module nbd not found.

执行下面的命令安装nbd模块

[@xx] # cat /etc/redhat-release
CentOS Linux release 7.4.1708 (Core)
[@xx] # uname -a
Linux localhost 3.10.0-693.el7.x86_64 #1 SMP Tue Aug 22 21:09:27 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
[] # uname -r
3.10.0-693.el7.x86_64

安装kernel组件

[@xx] sudo yum install kernel-devel kernel-headers

编译安装nbd组件

# 下载对应的内核源码包
[@xx] # wget http://vault.centos.org/7.4.1708/os/Source/SPackages/kernel-3.10.0-693.el7.src.rpm
[@xx] # rpm -ihv kernel-3.10.0-693.el7.src.rpm
[@xx] # cd /root/rpmbuild/SOURCES/
[@xx] # tar Jxvf linux-3.10.0-693.el7.tar.xz -C /usr/src/kernels/
[@xx] # cd /usr/src/kernels/
# 配置源码
[@xx] # mv $(uname -r) $(uname -r)-old
[@xx] # mv linux-3.10.0-693.el7 $(uname -r)
# 编译安装
[@xx 3.10.0-693.el7.x86_64] # cd $(uname -r)
[@xx 3.10.0-693.el7.x86_64] # make mrproper
[@xx 3.10.0-693.el7.x86_64] # cp ../$(uname -r)-old/Module.symvers ./
[@xx 3.10.0-693.el7.x86_64] # cp /boot/config-$(uname -r) ./.config
[@xx 3.10.0-693.el7.x86_64] # make oldconfig
[@xx 3.10.0-693.el7.x86_64] # make prepare
[@xx 3.10.0-693.el7.x86_64] # make scripts
[@xx 3.10.0-693.el7.x86_64] # make CONFIG_BLK_DEV_NBD=m M=drivers/block
[@xx 3.10.0-693.el7.x86_64] # cp drivers/block/nbd.ko /lib/modules/$(uname -r)/kernel/drivers/block/
[@xx 3.10.0-693.el7.x86_64] # depmod -a
# 查看nbd模块
[@xx 3.10.0-693.el7.x86_64]$ modinfo nbd
filename:       /lib/modules/3.10.0-693.el7.x86_64/kernel/drivers/block/nbd.ko
license:        GPL
description:    Network Block Device
rhelversion:    7.4
srcversion:     EDE909A294AC5FE08E81957
depends:        
vermagic:       3.10.0 SMP mod_unload modversions 
parm:           nbds_max:number of network block devices to initialize (default: 16) (int)
parm:           max_part:number of partitions per device (default: 0) (int)
parm:           debugflags:flags for controlling debug output (int)

编译安装时的错误处理
阶段:make CONFIG_BLK_DEV_NBD=m M=drivers/block

drivers/block/nbd.c: 在函数‘__nbd_ioctl’中:
drivers/block/nbd.c:619:19: 错误:‘REQ_TYPE_SPECIAL’未声明(在此函数内第一次使用)
   sreq.cmd_type = REQ_TYPE_SPECIAL;
                   ^
drivers/block/nbd.c:619:19: 附注:每个未声明的标识符在其出现的函数内只报告一次
make[1]: *** [drivers/block/nbd.o] 错误 1
make: *** [_module_drivers/block] 错误 2

处理:

[@xx 3.10.0-693.el7.x86_64] # vim include/linux/blkdev.h
# 由代码可知 REQ_TYPE_SPECIAL = 7
/*
 * request command types
 */
enum rq_cmd_type_bits {
        REQ_TYPE_FS             = 1,    /* fs request */
        REQ_TYPE_BLOCK_PC,              /* scsi command */
        REQ_TYPE_SENSE,                 /* sense request */
        REQ_TYPE_PM_SUSPEND,            /* suspend request */
        REQ_TYPE_PM_RESUME,             /* resume request */
        REQ_TYPE_PM_SHUTDOWN,           /* shutdown request */
#ifdef __GENKSYMS__
        REQ_TYPE_SPECIAL,               /* driver defined type */
#else
        REQ_TYPE_DRV_PRIV,              /* driver defined type */
#endif
        /*
         * for ATA/ATAPI devices. this really doesn't belong here, ide should
         * be cleaned up to use private REQ_LB opcodes to differentiate what
         * type of request this is
         */
        REQ_TYPE_ATA_TASKFILE,
        REQ_TYPE_ATA_PC,
};
# 修改nbd.c文件
[@xx 3.10.0-693.el7.x86_64] # vim drivers/block/nbd.c
# sreq.cmd_type = REQ_TYPE_SPECIAL;
sreq.cmd_type = 7;
# 重新执行命令
[@xx 3.10.0-693.el7.x86_64] # make CONFIG_BLK_DEV_NBD=m M=drivers/block

二、设置镜像共享

设置嵌套虚拟机文件夹共享
qcow2文件放置在Mac本地文件夹中,嵌套虚拟机通过文件共享的方式使用该qcow2文件。需要注意的是,qcow2文件权限需要在macOS中设置为可读写,否则在嵌套虚拟机中无法更新配置。

[mac@] # chmod 755 ./CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2

嵌套虚拟机中需要关闭SELinux,否则同样无法更新镜像内容

sudo /usr/sbin/setenforce 0

三、guestfish工具使用

1、示例程序:获取镜像ip地址

guestfish --rw -a ./CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2
# run
# list-filesystems
/dev/sda1: xfs
# mount /dev/sda1 /
# vi /var/log/cloud-init.log

2、示例程序:配置用户访问权限

guestfish --rw -a ./CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2
# run
# list-filesystems
/dev/sda1: xfs
# mount /dev/sda1 /
# mkdir /home/dc2-user/.ssh
# 注意guestfish与shell的区别,权限位是四位
# chown 1001 1001 /home/dc2-user/.ssh
# chmod 0700 /home/dc2-user/.ssh
# touch /home/dc2-user/.ssh/authorized_keys
# chmod 0600 /home/dc2-user/.ssh/authorized_keys
# chown 1001 1001 /home/dc2-user/.ssh/authorized_keys
# ll /home/dc2-user/.ssh/authorized_keys
-rw------- 1 1001 1001 0 Mar  1 10:05 /sysroot/home/dc2-user/.ssh/authorized_keys
# vi /home/dc2-user/.ssh/authorized_keys
# 添加公钥...
# quit

四、nbd挂载方式修改镜像(qemu-nbd)

1、确保已安装nbd模块,加载模块

# modinfo nbd
# insmod /lib/modules/3.10.0-693.el7.x86_64/kernel/drivers/block/nbd.ko max_part=8

3、建立nbd连接,挂载到目录

# qemu-nbd -c /dev/nbd0 ./CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2
# mkdir -p /mnt/qcows/qcow0
# mount /dev/nbd0p1  /mnt/qcows/qcow0

4、执行chroot

# chroot /mnt/qcows/qcow0/

5、执行修改,比如

$ passwd dc2-user
# 或其它操作

6、修改完毕后解除挂载点,解除连接

# umount /mnt/qcows/qcow0
# qemu-nbd -d /dev/nbd0

五、通过virt-manager挂载虚拟机

1、执行

virt-manager

2、新建虚拟机
选择“导入现有磁盘”,“使用ISO镜像”,选择qcow2文件…
如果报“WARNING: KVM不可用。这可能是因为没有安装KVM软件包,或者没有载入KVM内核模块,您的虚拟机性能可能很差”的警告,相应的解决方案是:

  • 关闭虚拟机
  • 进入虚拟机设置(可以配置网卡,硬盘,光驱的地方)
  • 点击“处理器和内存”,勾选虚拟化Intel VT-x/EPT 或AMD-V/RVI(V)

3、登录机器,修改配置
修改IP地址和密码、配置公钥方式登录,具体操作在前面已说明。

六、清理痕迹

1、清理/var/log/文件夹
2、删除cloud-init执行记录
cloud-init是专门为云环境的虚拟机初始化而开发的工具,通过读取相应的数据,对虚拟机进行配置。其执行完毕后会在一个叫 sem 的目录下创建信号文件,以便下次启动模块时不再重复执行语句。其目录位置为:
/var/lib/cloud/instances/实例ID/sem

sudo rm -rf /var/lib/cloud/instances
sudo rm -rf /var/lib/cloud/instance

3、清理history和.ssh目录等

rm -rf /home/xxx/.ssh
echo '' > /home/xxx/.bash_history
echo '' > /root/.bash_history
 
rm -rf /root/.oracle_jre_usage

七、去除磁盘空洞

# 创建同样大小的镜像
$ qemu-img create -f qcow2 CentOS-7-x86_64-GenericCloud-1708-20180329.qcow2 40G
$ virt-sparsify -x ./CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2 --convert qcow2 ./CentOS-7-x86_64-GenericCloud-1708-20180329.qcow2
$ du -sh *
7.3G	CentOS-7-x86_64-GenericCloud-1708-20180123.qcow2
5.3G	CentOS-7-x86_64-GenericCloud-1708-20180329.qcow2