diff -Nru nova-22.0.1/api-guide/source/port_with_resource_request.rst nova-22.2.2/api-guide/source/port_with_resource_request.rst --- nova-22.0.1/api-guide/source/port_with_resource_request.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/api-guide/source/port_with_resource_request.rst 2021-07-09 05:08:56.000000000 +0200 @@ -29,7 +29,9 @@ neutron ports having resource requests if both the source and destination compute services are upgraded to 20.0.0 (Train) and the ``[upgrade_levels]/compute`` configuration does not prevent the computes from -using the latest RPC version. +using the latest RPC version. However cross cell resize and cross cell migrate +operations are still not supported with such ports and Nova will fall back to +same-cell resize if the server has such ports. As of 21.0.0 (Ussuri), nova supports evacuating, live migrating and unshelving servers with neutron ports having resource requests. diff -Nru nova-22.0.1/debian/changelog nova-22.2.2/debian/changelog --- nova-22.0.1/debian/changelog 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/changelog 2021-07-09 13:16:57.000000000 +0200 @@ -1,3 +1,14 @@ +nova (2:22.2.2-1+deb11u1) unstable; urgency=medium + + * Tune nova-api-{,metadata-}uwsgi.ini for performance. + * New upstream release. + * CVE-2021-3654: novnc allows open redirection. Added upstream patch: + Reject_open_redirection_in_the_console_proxy.patch (Closes: #991441). + * Do not maintain glance_api_servers through debconf (as the default of + reading its URL in the Keystone catalogue is better). + + -- Thomas Goirand Fri, 09 Jul 2021 13:16:57 +0200 + nova (2:22.0.1-2) unstable; urgency=medium * Add depend on python3-q-text-as-data (Closes: #990705). diff -Nru nova-22.0.1/debian/nova-api-metadata-uwsgi.ini nova-22.2.2/debian/nova-api-metadata-uwsgi.ini --- nova-22.0.1/debian/nova-api-metadata-uwsgi.ini 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/nova-api-metadata-uwsgi.ini 2021-07-09 13:16:57.000000000 +0200 @@ -34,6 +34,23 @@ # exit instead of brutal reload on SIGTERM die-on-term = true +########################## +### Performance tuning ### +########################## +# Threads and processes +enable-threads = true + +# For max perf, set this to number of core*2 +processes = 8 + +# This was benchmarked as a good value +threads = 32 + +# This is the number of sockets in the queue. +# It improves a lot performances. This is comparable +# to the Apache ServerLimit/MaxClients option. +listen = 100 + ################################## ### OpenStack service specific ### ################################## diff -Nru nova-22.0.1/debian/nova-api-uwsgi.ini nova-22.2.2/debian/nova-api-uwsgi.ini --- nova-22.0.1/debian/nova-api-uwsgi.ini 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/nova-api-uwsgi.ini 2021-07-09 13:16:57.000000000 +0200 @@ -15,8 +15,6 @@ # Threads and processes enable-threads = true -processes = 4 - # uwsgi recommends this to prevent thundering herd on accept. thunder-lock = true @@ -34,6 +32,23 @@ # exit instead of brutal reload on SIGTERM die-on-term = true +########################## +### Performance tuning ### +########################## +# Threads and processes +enable-threads = true + +# For max perf, set this to number of core*2 +processes = 16 + +# This was benchmarked as a good value +threads = 1 + +# This is the number of sockets in the queue. +# It improves a lot performances. This is comparable +# to the Apache ServerLimit/MaxClients option. +listen = 100 + ################################## ### OpenStack service specific ### ################################## diff -Nru nova-22.0.1/debian/nova-common.config.in nova-22.2.2/debian/nova-common.config.in --- nova-22.0.1/debian/nova-common.config.in 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/nova-common.config.in 2021-07-09 13:16:57.000000000 +0200 @@ -83,7 +83,5 @@ # Cinder os_region_name config pkgos_read_config -p high ${NOVA_CONF} cinder os_region_name nova/cinder_os_region_name -# Glance api_servers config -pkgos_read_config -p high ${NOVA_CONF} glance api_servers nova/glance_api_servers exit 0 diff -Nru nova-22.0.1/debian/nova-common.postinst.in nova-22.2.2/debian/nova-common.postinst.in --- nova-22.0.1/debian/nova-common.postinst.in 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/nova-common.postinst.in 2021-07-09 13:16:57.000000000 +0200 @@ -64,13 +64,6 @@ fi } -manage_glance_api_servers () { - db_get nova/glance_api_servers - if [ -n "${RET}" ] ; then - pkgos_inifile set ${NOVA_CONF} glance api_servers ${RET} - fi -} - maintain_var_lib_nova_perms () { if [ -z "$2" ]; then # New install - blanket permissions @@ -200,7 +193,6 @@ manage_neutron_config manage_placement_config manage_cinder_os_region - manage_glance_api_servers # Needed, because in some cases, it's owned by root:root, # which makes the nova-manage db sync fail. diff -Nru nova-22.0.1/debian/nova-common.templates.in nova-22.2.2/debian/nova-common.templates.in --- nova-22.0.1/debian/nova-common.templates.in 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/nova-common.templates.in 2021-07-09 13:16:57.000000000 +0200 @@ -56,12 +56,6 @@ _Description: Cinder os region name: Please enter the region name for cinder service. -Template: nova/glance_api_servers -Type: string -Default: http://localhost:9292 -_Description: Glance api server URL: - Please enter the URL of the Glance api server. - Template: nova/placement_admin_tenant_name Type: string Default: service diff -Nru nova-22.0.1/debian/patches/CVE-2021-3654_Reject_open_redirection_in_the_console_proxy.patch nova-22.2.2/debian/patches/CVE-2021-3654_Reject_open_redirection_in_the_console_proxy.patch --- nova-22.0.1/debian/patches/CVE-2021-3654_Reject_open_redirection_in_the_console_proxy.patch 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/debian/patches/CVE-2021-3654_Reject_open_redirection_in_the_console_proxy.patch 2021-07-09 13:16:57.000000000 +0200 @@ -0,0 +1,150 @@ +Description: CVE-2021-3654 Reject open redirection in the console proxy + Our console proxies (novnc, serial, spice) run in a websockify server + whose request handler inherits from the python standard + SimpleHTTPRequestHandler. There is a known issue [1] in the + SimpleHTTPRequestHandler which allows open redirects by way of URLs + in the following format: + . + http://vncproxy.my.domain.com//example.com/%2F.. + . + which if visited, will redirect a user to example.com. + . + We can intercept a request and reject requests that pass a redirection + URL beginning with "//" by implementing the + SimpleHTTPRequestHandler.send_head() method containing the + vulnerability to reject such requests with a 400 Bad Request. + . + This code is copied from a patch suggested in one of the issue comments + [2]. + [1] https://bugs.python.org/issue32084 + [2] https://bugs.python.org/issue32084#msg306545 + . + Conflicts: + nova/console/websocketproxy.py + nova/tests/unit/console/test_websocketproxy.py + . + NOTE(melwitt): The conflicts are because the following changes are not + in Victoria: + . + Ib2c406327fef2fb4868d8050fc476a7d17706e23 (Remove six.moves) + I58b0382c86d4ef798572edb63d311e0e3e6937bb (Refactor and rename + test_tcp_rst_no_compute_rpcapi) +Author: melanie witt +Date: Thu, 13 May 2021 05:43:42 +0000 +Bug-Ubuntu: https://bugs.launchpad.net/nova/+bug/1927677 +Bug-Debian: https://bugs.debian.org/991441 +Change-Id: Ie36401c782f023d1d5f2623732619105dc2cfa24 +Origin: upstream, https://review.opendev.org/c/openstack/nova/+/791805 +Last-Update: 2021-07-29 + +diff --git a/nova/console/websocketproxy.py b/nova/console/websocketproxy.py +index 5f2985d..1410d64 100644 +--- a/nova/console/websocketproxy.py ++++ b/nova/console/websocketproxy.py +@@ -19,6 +19,8 @@ + ''' + + import copy ++from http import HTTPStatus ++import os + import socket + import sys + +@@ -281,6 +283,27 @@ + def socket(self, *args, **kwargs): + return websockifyserver.WebSockifyServer.socket(*args, **kwargs) + ++ def send_head(self): ++ # This code is copied from this example patch: ++ # https://bugs.python.org/issue32084#msg306545 ++ path = self.translate_path(self.path) ++ if os.path.isdir(path): ++ parts = urlparse.urlsplit(self.path) ++ if not parts.path.endswith('/'): ++ # redirect browser - doing basically what apache does ++ new_parts = (parts[0], parts[1], parts[2] + '/', ++ parts[3], parts[4]) ++ new_url = urlparse.urlunsplit(new_parts) ++ ++ # Browsers interpret "Location: //uri" as an absolute URI ++ # like "http://URI" ++ if new_url.startswith('//'): ++ self.send_error(HTTPStatus.BAD_REQUEST, ++ "URI must not start with //") ++ return None ++ ++ return super(NovaProxyRequestHandler, self).send_head() ++ + + class NovaWebSocketProxy(websockify.WebSocketProxy): + def __init__(self, *args, **kwargs): +diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py +index 3c234df..4ed2d2d 100644 +--- a/nova/tests/unit/console/test_websocketproxy.py ++++ b/nova/tests/unit/console/test_websocketproxy.py +@@ -626,6 +626,40 @@ + self.wh.server.top_new_client(conn, address) + self.assertIsNone(self.wh._compute_rpcapi) + ++ def test_reject_open_redirect(self): ++ # This will test the behavior when an attempt is made to cause an open ++ # redirect. It should be rejected. ++ mock_req = mock.MagicMock() ++ mock_req.makefile().readline.side_effect = [ ++ b'GET //example.com/%2F.. HTTP/1.1\r\n', ++ b'' ++ ] ++ ++ # Collect the response data to verify at the end. The ++ # SimpleHTTPRequestHandler writes the response data by calling the ++ # request socket sendall() method. ++ self.data = b'' ++ ++ def fake_sendall(data): ++ self.data += data ++ ++ mock_req.sendall.side_effect = fake_sendall ++ ++ client_addr = ('8.8.8.8', 54321) ++ mock_server = mock.MagicMock() ++ # This specifies that the server will be able to handle requests other ++ # than only websockets. ++ mock_server.only_upgrade = False ++ ++ # Constructing a handler will process the mock_req request passed in. ++ websocketproxy.NovaProxyRequestHandler( ++ mock_req, client_addr, mock_server) ++ ++ # Verify no redirect happens and instead a 400 Bad Request is returned. ++ self.data = self.data.decode() ++ self.assertIn('Error code: 400', self.data) ++ self.assertIn('Message: URI must not start with //', self.data) ++ + @mock.patch('websockify.websocketproxy.select_ssl_version') + def test_ssl_min_version_is_not_set(self, mock_select_ssl): + websocketproxy.NovaWebSocketProxy() +diff --git a/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml +new file mode 100644 +index 0000000..ce05b9a +--- /dev/null ++++ b/releasenotes/notes/console-proxy-reject-open-redirect-4ac0a7895acca7eb.yaml +@@ -0,0 +1,19 @@ ++--- ++security: ++ - | ++ A vulnerability in the console proxies (novnc, serial, spice) that allowed ++ open redirection has been `patched`_. The novnc, serial, and spice console ++ proxies are implemented as websockify servers and the request handler ++ inherits from the python standard SimpleHTTPRequestHandler. There is a ++ `known issue`_ in the SimpleHTTPRequestHandler which allows open redirects ++ by way of URLs in the following format:: ++ ++ http://vncproxy.my.domain.com//example.com/%2F.. ++ ++ which if visited, will redirect a user to example.com. ++ ++ The novnc, serial, and spice console proxies will now reject requests that ++ pass a redirection URL beginning with "//" with a 400 Bad Request. ++ ++ .. _patched: https://bugs.launchpad.net/nova/+bug/1927677 ++ .. _known issue: https://bugs.python.org/issue32084 diff -Nru nova-22.0.1/debian/patches/series nova-22.2.2/debian/patches/series --- nova-22.0.1/debian/patches/series 2021-07-12 12:57:03.000000000 +0200 +++ nova-22.2.2/debian/patches/series 2021-07-09 13:16:57.000000000 +0200 @@ -3,3 +3,4 @@ remove-crashing-blockdiag-doc-line.patch remove-svg-converter-from-doc-conf.py.patch Add-a-healtcheck-url.patch +CVE-2021-3654_Reject_open_redirection_in_the_console_proxy.patch diff -Nru nova-22.0.1/devstack/nova-multi-cell-blacklist.txt nova-22.2.2/devstack/nova-multi-cell-blacklist.txt --- nova-22.0.1/devstack/nova-multi-cell-blacklist.txt 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/devstack/nova-multi-cell-blacklist.txt 2021-07-09 05:08:56.000000000 +0200 @@ -4,3 +4,9 @@ # Exclude tempest.scenario.test_network tests since they are slow and # only test advanced neutron features, unrelated to multi-cell testing. ^tempest.scenario.test_network + +# Also exlude resize and migrate tests with qos ports as qos is currently +# not supported in cross cell resize case . See +# https://bugs.launchpad.net/nova/+bug/1907511 for details +test_migrate_with_qos_min_bw_allocation +test_resize_with_qos_min_bw_allocation diff -Nru nova-22.0.1/doc/source/admin/configuration/cross-cell-resize.rst nova-22.2.2/doc/source/admin/configuration/cross-cell-resize.rst --- nova-22.0.1/doc/source/admin/configuration/cross-cell-resize.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/admin/configuration/cross-cell-resize.rst 2021-07-09 05:08:56.000000000 +0200 @@ -237,7 +237,8 @@ * Instances with ports attached that have :doc:`bandwidth-aware ` resource - provider allocations. + provider allocations. Nova falls back to same-cell resize if the server has + such ports. * Rescheduling to alternative hosts within the same target cell in case the primary selected host fails the ``prep_snapshot_based_resize_at_dest`` call. diff -Nru nova-22.0.1/doc/source/admin/networking.rst nova-22.2.2/doc/source/admin/networking.rst --- nova-22.0.1/doc/source/admin/networking.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/admin/networking.rst 2021-07-09 05:08:56.000000000 +0200 @@ -24,6 +24,18 @@ :neutron-doc:`OpenStack Networking service documentation ` +.. note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. NUMA Affinity ------------- diff -Nru nova-22.0.1/doc/source/admin/pci-passthrough.rst nova-22.2.2/doc/source/admin/pci-passthrough.rst --- nova-22.0.1/doc/source/admin/pci-passthrough.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/admin/pci-passthrough.rst 2021-07-09 05:08:56.000000000 +0200 @@ -37,6 +37,18 @@ supported until the 14.0.0 Newton release, see `bug 1512800 `_ for details. +.. note:: + + Nova only supports PCI addresses where the fields are restricted to the + following maximum value: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 + + Nova will ignore PCI devices reported by the hypervisor if the address is + outside of these ranges. Configure host (Compute) ------------------------ diff -Nru nova-22.0.1/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst nova-22.2.2/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst --- nova-22.0.1/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/admin/secure-live-migration-with-qemu-native-tls.rst 2021-07-09 05:08:56.000000000 +0200 @@ -120,10 +120,13 @@ (1) On all relevant compute nodes, enable the :oslo.config:option:`libvirt.live_migration_with_native_tls` - configuration attribute:: + configuration attribute and set the + :oslo.config:option:`libvirt.live_migration_scheme` + configuration attribute to tls:: [libvirt] live_migration_with_native_tls = true + live_migration_scheme = tls .. note:: Setting both @@ -131,6 +134,12 @@ :oslo.config:option:`libvirt.live_migration_tunnelled` at the same time is invalid (and disallowed). + .. note:: + Not setting + :oslo.config:option:`libvirt.live_migration_scheme` to ``tls`` + will result in libvirt using the unencrypted TCP connection + without displaying any error or a warning in the logs. + And restart the ``nova-compute`` service:: $ systemctl restart openstack-nova-compute diff -Nru nova-22.0.1/doc/source/cli/nova-status.rst nova-22.2.2/doc/source/cli/nova-status.rst --- nova-22.0.1/doc/source/cli/nova-status.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/cli/nova-status.rst 2021-07-09 05:08:56.000000000 +0200 @@ -142,6 +142,8 @@ **22.0.0 (Victoria)** * Checks for the policy files is not JSON-formatted. + * Checks for computes older than the previous major release. This check was + backported from 23.0.0 (Wallaby). See Also ======== diff -Nru nova-22.0.1/doc/source/contributor/ptl-guide.rst nova-22.2.2/doc/source/contributor/ptl-guide.rst --- nova-22.0.1/doc/source/contributor/ptl-guide.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/contributor/ptl-guide.rst 2021-07-09 05:08:56.000000000 +0200 @@ -257,6 +257,9 @@ * Example: https://review.opendev.org/543580 + * Bump the oldest supported compute service version + * https://review.opendev.org/#/c/738482/ + * Create the launchpad series for the next cycle * Set the development focus of the project to the new cycle series diff -Nru nova-22.0.1/doc/source/install/verify.rst nova-22.2.2/doc/source/install/verify.rst --- nova-22.0.1/doc/source/install/verify.rst 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/doc/source/install/verify.rst 2021-07-09 05:08:56.000000000 +0200 @@ -131,3 +131,7 @@ | Result: Success | | Details: None | +--------------------------------------------------------------------+ + | Check: Older than N-1 computes | + | Result: Success | + | Details: None | + +--------------------------------------------------------------------+ diff -Nru nova-22.0.1/gate/live_migration/hooks/ceph.sh nova-22.2.2/gate/live_migration/hooks/ceph.sh --- nova-22.0.1/gate/live_migration/hooks/ceph.sh 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/gate/live_migration/hooks/ceph.sh 1970-01-01 01:00:00.000000000 +0100 @@ -1,208 +0,0 @@ -#!/bin/bash - -function prepare_ceph { - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/settings - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph - configure_ceph - #install ceph-common package and additional python3 ceph libraries on compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m raw -a "executable=/bin/bash - USE_PYTHON3=${USE_PYTHON3:-True} - source $BASE/new/devstack/functions - source $BASE/new/devstack/functions-common - git clone https://opendev.org/openstack/devstack-plugin-ceph /tmp/devstack-plugin-ceph - source /tmp/devstack-plugin-ceph/devstack/lib/ceph - install_ceph_remote - " - - #copy ceph admin keyring to compute nodes - sudo cp /etc/ceph/ceph.client.admin.keyring /tmp/ceph.client.admin.keyring - sudo chown ${STACK_USER}:${STACK_USER} /tmp/ceph.client.admin.keyring - sudo chmod 644 /tmp/ceph.client.admin.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.admin.keyring dest=/etc/ceph/ceph.client.admin.keyring owner=ceph group=ceph" - sudo rm -f /tmp/ceph.client.admin.keyring - #copy ceph.conf to compute nodes - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/etc/ceph/ceph.conf dest=/etc/ceph/ceph.conf owner=root group=root" - - start_ceph -} - -function _ceph_configure_glance { - GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${GLANCE_CEPH_POOL} ${GLANCE_CEPH_POOL_PG} ${GLANCE_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${GLANCE_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${GLANCE_CEPH_USER}.keyring - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=DEFAULT option=show_image_direct_url value=True" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=default_store value=rbd" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=stores value='file, http, rbd'" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_user value=$GLANCE_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${GLANCE_API_CONF} section=glance_store option=rbd_store_pool value=$GLANCE_CEPH_POOL" - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${GLANCE_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - #copy glance keyring to compute only node - sudo cp /etc/ceph/ceph.client.glance.keyring /tmp/ceph.client.glance.keyring - sudo chown $STACK_USER:$STACK_USER /tmp/ceph.client.glance.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.glance.keyring dest=/etc/ceph/ceph.client.glance.keyring" - sudo rm -f /tmp/ceph.client.glance.keyring -} - -function configure_and_start_glance { - _ceph_configure_glance - echo 'check processes before glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" - - # restart glance - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@g-api" - - echo 'check processes after glance-api stop' - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "ps aux | grep glance-api" -} - -function _ceph_configure_nova { - #setup ceph for nova, we don't reuse configure_ceph_nova - as we need to emulate case where cinder is not configured for ceph - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${NOVA_CEPH_POOL} ${NOVA_CEPH_POOL_PG} ${NOVA_CEPH_POOL_PGP} - NOVA_CONF=${NOVA_CPU_CONF:-/etc/nova/nova.conf} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_user value=${CINDER_CEPH_USER}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=rbd_secret_uuid value=${CINDER_CEPH_UUID}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_key value=false" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=inject_partition value=-2" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=disk_cachemodes value='network=writeback'" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_type value=rbd" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_pool value=${NOVA_CEPH_POOL}" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_ceph_conf value=${CEPH_CONF_FILE}" - - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${CINDER_CEPH_USER} \ - mon "allow r" \ - osd "allow class-read object_prefix rbd_children, allow rwx pool=${CINDER_CEPH_POOL}, allow rwx pool=${NOVA_CEPH_POOL},allow rwx pool=${GLANCE_CEPH_POOL}" | \ - sudo tee ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring > /dev/null - sudo chown ${STACK_USER}:$(id -g -n $whoami) ${CEPH_CONF_DIR}/ceph.client.${CINDER_CEPH_USER}.keyring - - #copy cinder keyring to compute only node - sudo cp /etc/ceph/ceph.client.cinder.keyring /tmp/ceph.client.cinder.keyring - sudo chown stack:stack /tmp/ceph.client.cinder.keyring - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/ceph.client.cinder.keyring dest=/etc/ceph/ceph.client.cinder.keyring" - sudo rm -f /tmp/ceph.client.cinder.keyring - - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${NOVA_CEPH_POOL} crush_ruleset ${RULE_ID} - fi -} - -function _wait_for_nova_compute_service_state { - source $BASE/new/devstack/openrc admin admin - local status=$1 - local attempt=1 - local max_attempts=24 - local attempt_sleep=5 - local computes_count=$(openstack compute service list | grep -c nova-compute) - local computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - - echo "Waiting for $computes_count computes to report as $status" - while [ "$computes_ready" -ne "$computes_count" ]; do - if [ "$attempt" -eq "$max_attempts" ]; then - echo "Failed waiting for computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${max_attempts} attempts" - exit 4 - fi - echo "Waiting ${attempt_sleep} seconds for ${computes_count} computes to report as ${status}, ${computes_ready}/${computes_count} ${status} after ${attempt}/${max_attempts} attempts" - sleep $attempt_sleep - attempt=$((attempt+1)) - computes_ready=$(openstack compute service list | grep nova-compute | grep $status | wc -l) - done - echo "All computes are now reporting as ${status} after ${attempt} attempts" -} - -function configure_and_start_nova { - - echo "Checking all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # stop nova-compute - echo "Stopping all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl stop devstack@n-cpu" - - # Wait for the service to be marked as down - _wait_for_nova_compute_service_state "down" - - _ceph_configure_nova - - #import secret to libvirt - _populate_libvirt_secret - - # start nova-compute - echo "Starting all n-cpu services" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl start devstack@n-cpu" - - echo "Checking all n-cpu services" - # test that they are all running again - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "pgrep -u stack -a nova-compute" - - # Wait for the service to be marked as up - _wait_for_nova_compute_service_state "up" -} - -function _ceph_configure_cinder { - sudo ceph -c ${CEPH_CONF_FILE} osd pool create ${CINDER_CEPH_POOL} ${CINDER_CEPH_POOL_PG} ${CINDER_CEPH_POOL_PGP} - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} size ${CEPH_REPLICAS} - if [[ $CEPH_REPLICAS -ne 1 ]]; then - sudo ceph -c ${CEPH_CONF_FILE} osd pool set ${CINDER_CEPH_POOL} crush_ruleset ${RULE_ID} - fi - - CINDER_CONF=${CINDER_CONF:-/etc/cinder/cinder.conf} - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_backend_name value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=volume_driver value=cinder.volume.drivers.rbd.RBDDriver" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_ceph_conf value=$CEPH_CONF_FILE" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_pool value=$CINDER_CEPH_POOL" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_user value=$CINDER_CEPH_USER" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_uuid value=$CINDER_CEPH_UUID" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_flatten_volume_from_snapshot value=False" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=ceph option=rbd_max_clone_depth value=5" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=default_volume_type value=ceph" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$CINDER_CONF section=DEFAULT option=enabled_backends value=ceph" - -} - -function configure_and_start_cinder { - _ceph_configure_cinder - - # restart cinder - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "systemctl restart devstack@c-vol" - - source $BASE/new/devstack/openrc - - export OS_USERNAME=admin - export OS_PROJECT_NAME=admin - lvm_type=$(cinder type-list | awk -F "|" 'NR==4{ print $2}') - cinder type-delete $lvm_type - openstack volume type create --os-volume-api-version 1 --property volume_backend_name="ceph" ceph -} - -function _populate_libvirt_secret { - cat > /tmp/secret.xml < - ${CINDER_CEPH_UUID} - - client.${CINDER_CEPH_USER} secret - - -EOF - - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m copy -a "src=/tmp/secret.xml dest=/tmp/secret.xml" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-define --file /tmp/secret.xml" - local secret=$(sudo ceph -c ${CEPH_CONF_FILE} auth get-key client.${CINDER_CEPH_USER}) - # TODO(tdurakov): remove this escaping as https://github.com/ansible/ansible/issues/13862 fixed - secret=${secret//=/'\='} - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "virsh secret-set-value --secret ${CINDER_CEPH_UUID} --base64 $secret" - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/tmp/secret.xml state=absent" - -} diff -Nru nova-22.0.1/gate/live_migration/hooks/nfs.sh nova-22.2.2/gate/live_migration/hooks/nfs.sh --- nova-22.0.1/gate/live_migration/hooks/nfs.sh 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/gate/live_migration/hooks/nfs.sh 1970-01-01 01:00:00.000000000 +0100 @@ -1,50 +0,0 @@ -#!/bin/bash - -function nfs_setup { - if uses_debs; then - module=apt - elif is_fedora; then - module=yum - fi - $ANSIBLE all --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-common state=present" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m $module \ - -a "name=nfs-kernel-server state=present" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-User value=nova" - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=/etc/idmapd.conf section=Mapping option=Nobody-Group value=nova" - - for SUBNODE in $SUBNODES ; do - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m lineinfile -a "dest=/etc/exports line='/opt/stack/data/nova/instances $SUBNODE(rw,fsid=0,insecure,no_subtree_check,async,no_root_squash)'" - done - - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "exportfs -a" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=restarted" - GetDistro - if [[ ! ${DISTRO} =~ (xenial) ]]; then - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=idmapd state=restarted" - fi - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 111 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p tcp --dport 2049 -j ACCEPT" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "iptables -A INPUT -p udp --dport 2049 -j ACCEPT" - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "mount -t nfs4 -o proto\=tcp,port\=2049 $primary_node:/ /opt/stack/data/nova/instances/" -} - -function nfs_configure_tempest { - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" -} - -function nfs_verify_setup { - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m file -a "path=/opt/stack/data/nova/instances/test_file state=touch" - if [ ! -e '/opt/stack/data/nova/instances/test_file' ]; then - die $LINENO "NFS configuration failure" - fi -} - -function nfs_teardown { - #teardown nfs shared storage - $ANSIBLE subnodes --become -f 5 -i "$WORKSPACE/inventory" -m shell -a "umount -t nfs4 /opt/stack/data/nova/instances/" - $ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m service -a "name=nfs-kernel-server state=stopped" -} \ No newline at end of file diff -Nru nova-22.0.1/gate/live_migration/hooks/run_tests.sh nova-22.2.2/gate/live_migration/hooks/run_tests.sh --- nova-22.0.1/gate/live_migration/hooks/run_tests.sh 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/gate/live_migration/hooks/run_tests.sh 1970-01-01 01:00:00.000000000 +0100 @@ -1,71 +0,0 @@ -#!/bin/bash -# Live migration dedicated ci job will be responsible for testing different -# environments based on underlying storage, used for ephemerals. -# This hook allows to inject logic of environment reconfiguration in ci job. -# Base scenario for this would be: -# -# 1. test with all local storage (use default for volumes) -# 2. test with NFS for root + ephemeral disks -# 3. test with Ceph for root + ephemeral disks -# 4. test with Ceph for volumes and root + ephemeral disk - -set -xe -cd $BASE/new/tempest - -source $BASE/new/devstack/functions -source $BASE/new/devstack/functions-common -source $BASE/new/devstack/lib/nova -source $WORKSPACE/devstack-gate/functions.sh -source $BASE/new/nova/gate/live_migration/hooks/utils.sh -source $BASE/new/nova/gate/live_migration/hooks/nfs.sh -source $BASE/new/nova/gate/live_migration/hooks/ceph.sh -primary_node=$(cat /etc/nodepool/primary_node_private) -SUBNODES=$(cat /etc/nodepool/sub_nodes_private) -SERVICE_HOST=$primary_node -STACK_USER=${STACK_USER:-stack} - -echo '1. test with all local storage (use default for volumes)' -echo 'NOTE: test_volume_backed_live_migration is skipped due to https://bugs.launchpad.net/nova/+bug/1524898' -run_tempest "block migration test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration))" - -# TODO(mriedem): Run $BASE/new/nova/gate/test_evacuate.sh for local storage - -#all tests bellow this line use shared storage, need to update tempest.conf -echo 'disabling block_migration in tempest' -$ANSIBLE primary --become -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=$BASE/new/tempest/etc/tempest.conf section=compute-feature-enabled option=block_migration_for_live_migration value=False" - -echo '2. NFS testing is skipped due to setup failures with Ubuntu 16.04' -#echo '2. test with NFS for root + ephemeral disks' - -#nfs_setup -#nfs_configure_tempest -#nfs_verify_setup -#run_tempest "NFS shared storage test" "live_migration" -#nfs_teardown - -# The nova-grenade-multinode job also runs resize and cold migration tests -# so we check for a grenade-only variable. -if [[ -n "$GRENADE_NEW_BRANCH" ]]; then - echo '3. test cold migration and resize' - run_tempest "cold migration and resize test" "test_resize_server|test_cold_migration|test_revert_cold_migration" -else - echo '3. cold migration and resize is skipped for non-grenade jobs' -fi - -echo '4. test with Ceph for root + ephemeral disks' -# Discover and set variables for the OS version so the devstack-plugin-ceph -# scripts can find the correct repository to install the ceph packages. -GetOSVersion -USE_PYTHON3=${USE_PYTHON3:-True} -prepare_ceph -GLANCE_API_CONF=${GLANCE_API_CONF:-/etc/glance/glance-api.conf} -configure_and_start_glance - -configure_and_start_nova -run_tempest "Ceph nova&glance test" "^.*test_live_migration(?!.*(test_volume_backed_live_migration))" - -set +e -#echo '5. test with Ceph for volumes and root + ephemeral disk' - -#configure_and_start_cinder -#run_tempest "Ceph nova&glance&cinder test" "live_migration" diff -Nru nova-22.0.1/gate/live_migration/hooks/utils.sh nova-22.2.2/gate/live_migration/hooks/utils.sh --- nova-22.0.1/gate/live_migration/hooks/utils.sh 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/gate/live_migration/hooks/utils.sh 1970-01-01 01:00:00.000000000 +0100 @@ -1,11 +0,0 @@ -#!/bin/bash - -function run_tempest { - local message=$1 - local tempest_regex=$2 - sudo -H -u tempest tox -eall -- $tempest_regex --concurrency=$TEMPEST_CONCURRENCY - exitcode=$? - if [[ $exitcode -ne 0 ]]; then - die $LINENO "$message failure" - fi -} diff -Nru nova-22.0.1/lower-constraints.txt nova-22.2.2/lower-constraints.txt --- nova-22.0.1/lower-constraints.txt 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/lower-constraints.txt 2021-07-09 05:08:56.000000000 +0200 @@ -90,7 +90,7 @@ osprofiler==1.4.0 ovs==2.10.0 ovsdbapp==0.15.0 -packaging==17.1 +packaging==20.4 paramiko==2.7.1 Paste==2.0.2 PasteDeploy==1.5.0 diff -Nru nova-22.0.1/nova/api/metadata/base.py nova-22.2.2/nova/api/metadata/base.py --- nova-22.0.1/nova/api/metadata/base.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/api/metadata/base.py 2021-07-09 05:08:56.000000000 +0200 @@ -124,8 +124,13 @@ if not content: content = [] + # NOTE(gibi): this is not a cell targeted context even if we are called + # in a situation when the instance is in a different cell than the + # metadata service itself. ctxt = context.get_admin_context() + self.mappings = _format_instance_mapping(instance) + # NOTE(danms): Sanitize the instance to limit the amount of stuff # inside that may not pickle well (i.e. context). We also touch # some of the things we'll lazy load later to make sure we keep their @@ -147,8 +152,6 @@ self.security_groups = security_group_api.get_instance_security_groups( ctxt, instance) - self.mappings = _format_instance_mapping(ctxt, instance) - if instance.user_data is not None: self.userdata_raw = base64.decode_as_bytes(instance.user_data) else: @@ -696,9 +699,8 @@ return InstanceMetadata(instance, address) -def _format_instance_mapping(ctxt, instance): - bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( - ctxt, instance.uuid) +def _format_instance_mapping(instance): + bdms = instance.get_bdms() return block_device.instance_block_mapping(instance, bdms) diff -Nru nova-22.0.1/nova/api/openstack/compute/admin_actions.py nova-22.2.2/nova/api/openstack/compute/admin_actions.py --- nova-22.0.1/nova/api/openstack/compute/admin_actions.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/api/openstack/compute/admin_actions.py 2021-07-09 05:08:56.000000000 +0200 @@ -19,8 +19,10 @@ from nova.api.openstack import wsgi from nova.api import validation from nova.compute import api as compute +from nova.compute import instance_actions from nova.compute import vm_states from nova import exception +from nova import objects from nova.policies import admin_actions as aa_policies # States usable in resetState action @@ -73,9 +75,14 @@ context.can(aa_policies.POLICY_ROOT % 'reset_state', target={'project_id': instance.project_id}) + # Log os-resetState as an instance action + instance_action = objects.InstanceAction.action_start( + context, instance.uuid, instance_actions.RESET_STATE) + # Identify the desired state from the body state = state_map[body["os-resetState"]["state"]] instance.vm_state = state instance.task_state = None instance.save(admin_state_reset=True) + instance_action.finish() diff -Nru nova-22.0.1/nova/api/openstack/wsgi_app.py nova-22.2.2/nova/api/openstack/wsgi_app.py --- nova-22.0.1/nova/api/openstack/wsgi_app.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/api/openstack/wsgi_app.py 2021-07-09 05:08:56.000000000 +0200 @@ -23,11 +23,14 @@ from nova import exception from nova import objects from nova import service +from nova import utils CONF = cfg.CONF CONFIG_FILES = ['api-paste.ini', 'nova.conf'] +LOG = logging.getLogger(__name__) + objects.register_all() @@ -40,6 +43,11 @@ def _setup_service(host, name): + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + logging.getLogger(__name__).warning(str(e)) + binary = name if name.startswith('nova-') else "nova-%s" % name ctxt = context.get_admin_context() @@ -71,8 +79,12 @@ return application -def init_application(name): - conf_files = _get_config_files() +@utils.run_once('Global data already initialized, not re-initializing.', + LOG.info) +def init_global_data(conf_files): + # NOTE(melwitt): parse_args initializes logging and calls global rpc.init() + # and db_api.configure(). The db_api.configure() call does not initiate any + # connection to the database. config.parse_args([], default_config_files=conf_files) logging.setup(CONF, "nova") @@ -87,11 +99,25 @@ logging.getLogger(__name__), logging.DEBUG) + +def init_application(name): + conf_files = _get_config_files() + + # NOTE(melwitt): The init_application method can be called multiple times + # within a single python interpreter instance if any exception is raised + # during it (example: DBConnectionError while setting up the service) and + # apache/mod_wsgi reloads the init_application script. So, we initialize + # global data separately and decorate the method to run only once in a + # python interpreter instance. + init_global_data(conf_files) + try: _setup_service(CONF.host, name) except exception.ServiceTooOld as exc: return error_application(exc, name) + # This global init is safe because if we got here, we already successfully + # set up the service and setting up the profile cannot fail. service.setup_profiler(name, CONF.host) conf = conf_files[0] diff -Nru nova-22.0.1/nova/cmd/status.py nova-22.2.2/nova/cmd/status.py --- nova-22.0.1/nova/cmd/status.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/cmd/status.py 2021-07-09 05:08:56.000000000 +0200 @@ -427,6 +427,16 @@ status = upgradecheck.Result(upgradecheck.Code.FAILURE, msg) return status + def _check_old_computes(self): + # warn if there are computes in the system older than the previous + # major release + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + return upgradecheck.Result(upgradecheck.Code.WARNING, str(e)) + + return upgradecheck.Result(upgradecheck.Code.SUCCESS) + # The format of the check functions is to return an upgradecheck.Result # object with the appropriate upgradecheck.Code and details set. If the # check hits warnings or failures then those should be stored in the @@ -447,6 +457,8 @@ (_('Policy Scope-based Defaults'), _check_policy), # Added in Victoria (_('Policy File JSON to YAML Migration'), _check_policy_json), + # Backported from Wallaby + (_('Older than N-1 computes'), _check_old_computes) ) diff -Nru nova-22.0.1/nova/compute/api.py nova-22.2.2/nova/compute/api.py --- nova-22.0.1/nova/compute/api.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/compute/api.py 2021-07-09 05:08:56.000000000 +0200 @@ -2179,22 +2179,22 @@ return True return False - def _local_delete_cleanup(self, context, instance): + def _local_delete_cleanup(self, context, instance_uuid): # NOTE(aarents) Ensure instance allocation is cleared and instance # mapping queued as deleted before _delete() return try: self.placementclient.delete_allocation_for_instance( - context, instance.uuid) + context, instance_uuid) except exception.AllocationDeleteFailed: LOG.info("Allocation delete failed during local delete cleanup.", - instance=instance) + instance_uuid=instance_uuid) try: - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance_uuid, True) except exception.InstanceMappingNotFound: LOG.info("Instance Mapping does not exist while attempting " "local delete cleanup.", - instance=instance) + instance_uuid=instance_uuid) def _attempt_delete_of_buildrequest(self, context, instance): # If there is a BuildRequest then the instance may not have been @@ -2231,7 +2231,7 @@ if not instance.host and not may_have_ports_or_volumes: try: if self._delete_while_booting(context, instance): - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return # If instance.host was not set it's possible that the Instance # object here was pulled from a BuildRequest object and is not @@ -2240,6 +2240,11 @@ # properly. A lookup is attempted which will either return a # full Instance or None if not found. If not found then it's # acceptable to skip the rest of the delete processing. + + # Save a copy of the instance UUID early, in case + # _lookup_instance returns instance = None, to pass to + # _local_delete_cleanup if needed. + instance_uuid = instance.uuid cell, instance = self._lookup_instance(context, instance.uuid) if cell and instance: try: @@ -2250,11 +2255,11 @@ except exception.InstanceNotFound: pass # The instance was deleted or is already gone. - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return if not instance: # Instance is already deleted. - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance_uuid) return except exception.ObjectActionError: # NOTE(melwitt): This means the instance.host changed @@ -2267,7 +2272,7 @@ cell, instance = self._lookup_instance(context, instance.uuid) if not instance: # Instance is already deleted - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance_uuid) return bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( @@ -2311,7 +2316,7 @@ 'field, its vm_state is %(state)s.', {'state': instance.vm_state}, instance=instance) - self._local_delete_cleanup(context, instance) + self._local_delete_cleanup(context, instance.uuid) return except exception.ObjectActionError as ex: # The instance's host likely changed under us as @@ -2496,7 +2501,7 @@ instance.destroy() @staticmethod - def _update_queued_for_deletion(context, instance, qfd): + def _update_queued_for_deletion(context, instance_uuid, qfd): # NOTE(tssurya): We query the instance_mapping record of this instance # and update the queued_for_delete flag to True (or False according to # the state of the instance). This just means that the instance is @@ -2505,7 +2510,7 @@ # value could be stale which is fine, considering its use is only # during down cell (desperate) situation. im = objects.InstanceMapping.get_by_instance_uuid(context, - instance.uuid) + instance_uuid) im.queued_for_delete = qfd im.save() @@ -2517,7 +2522,7 @@ instance.save() else: self.compute_rpcapi.terminate_instance(context, instance, bdms) - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance.uuid, True) def _do_soft_delete(self, context, instance, bdms, local=False): if local: @@ -2527,7 +2532,7 @@ instance.save() else: self.compute_rpcapi.soft_delete_instance(context, instance) - self._update_queued_for_deletion(context, instance, True) + self._update_queued_for_deletion(context, instance.uuid, True) # NOTE(maoy): we allow delete to be called no matter what vm_state says. @check_instance_lock @@ -2580,7 +2585,7 @@ instance.task_state = None instance.deleted_at = None instance.save(expected_task_state=[None]) - self._update_queued_for_deletion(context, instance, False) + self._update_queued_for_deletion(context, instance.uuid, False) @check_instance_lock @check_instance_state(task_state=None, @@ -3870,8 +3875,7 @@ migration, migration.source_compute) - @staticmethod - def _allow_cross_cell_resize(context, instance): + def _allow_cross_cell_resize(self, context, instance): """Determine if the request can perform a cross-cell resize on this instance. @@ -3901,7 +3905,17 @@ 'version in the deployment %s is less than %s so ' 'cross-cell resize is not allowed at this time.', min_compute_version, MIN_COMPUTE_CROSS_CELL_RESIZE) - allowed = False + return False + + if self.network_api.get_requested_resource_for_instance( + context, instance.uuid): + LOG.info( + 'Request is allowed by policy to perform cross-cell ' + 'resize but the instance has ports with resource request ' + 'and cross-cell resize is not supported with such ports.', + instance=instance) + return False + return allowed @staticmethod diff -Nru nova-22.0.1/nova/compute/instance_actions.py nova-22.2.2/nova/compute/instance_actions.py --- nova-22.0.1/nova/compute/instance_actions.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/compute/instance_actions.py 2021-07-09 05:08:56.000000000 +0200 @@ -70,3 +70,4 @@ UNLOCK = 'unlock' BACKUP = 'createBackup' CREATE_IMAGE = 'createImage' +RESET_STATE = 'resetState' diff -Nru nova-22.0.1/nova/compute/manager.py nova-22.2.2/nova/compute/manager.py --- nova-22.0.1/nova/compute/manager.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/compute/manager.py 2021-07-09 05:08:56.000000000 +0200 @@ -1411,6 +1411,13 @@ eventlet.semaphore.BoundedSemaphore( CONF.compute.max_concurrent_disk_ops) + if CONF.compute.max_disk_devices_to_attach == 0: + msg = _('[compute]max_disk_devices_to_attach has been set to 0, ' + 'which will prevent instances from being able to boot. ' + 'Set -1 for unlimited or set >= 1 to limit the maximum ' + 'number of disk devices.') + raise exception.InvalidConfiguration(msg) + self.driver.init_host(host=self.host) context = nova.context.get_admin_context() instances = objects.InstanceList.get_by_host( @@ -1639,7 +1646,11 @@ return [_decode(f) for f in injected_files] def _validate_instance_group_policy(self, context, instance, - scheduler_hints): + scheduler_hints=None): + + if CONF.workarounds.disable_group_policy_check_upcall: + return + # NOTE(russellb) Instance group policy is enforced by the scheduler. # However, there is a race condition with the enforcement of # the policy. Since more than one instance may be scheduled at the @@ -1648,29 +1659,63 @@ # multiple instances with an affinity policy could end up on different # hosts. This is a validation step to make sure that starting the # instance here doesn't violate the policy. - group_hint = scheduler_hints.get('group') - if not group_hint: - return + if scheduler_hints is not None: + # only go through here if scheduler_hints is provided, even if it + # is empty. + group_hint = scheduler_hints.get('group') + if not group_hint: + return + else: + # The RequestSpec stores scheduler_hints as key=list pairs so + # we need to check the type on the value and pull the single + # entry out. The API request schema validates that + # the 'group' hint is a single value. + if isinstance(group_hint, list): + group_hint = group_hint[0] + + group = objects.InstanceGroup.get_by_hint(context, group_hint) + else: + # TODO(ganso): a call to DB can be saved by adding request_spec + # to rpcapi payload of live_migration, pre_live_migration and + # check_can_live_migrate_destination + try: + group = objects.InstanceGroup.get_by_instance_uuid( + context, instance.uuid) + except exception.InstanceGroupNotFound: + return - # The RequestSpec stores scheduler_hints as key=list pairs so we need - # to check the type on the value and pull the single entry out. The - # API request schema validates that the 'group' hint is a single value. - if isinstance(group_hint, list): - group_hint = group_hint[0] - - @utils.synchronized(group_hint) - def _do_validation(context, instance, group_hint): - group = objects.InstanceGroup.get_by_hint(context, group_hint) + @utils.synchronized(group['uuid']) + def _do_validation(context, instance, group): if group.policy and 'anti-affinity' == group.policy: + + # instances on host instances_uuids = objects.InstanceList.get_uuids_by_host( context, self.host) ins_on_host = set(instances_uuids) + + # instance param is just for logging, the nodename obtained is + # not actually related to the instance at all + nodename = self._get_nodename(instance) + + # instances being migrated to host + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, self.host, nodename)) + migration_vm_uuids = set([mig['instance_uuid'] + for mig in migrations]) + + total_instances = migration_vm_uuids | ins_on_host + + # refresh group to get updated members within locked block + group = objects.InstanceGroup.get_by_uuid(context, + group['uuid']) members = set(group.members) # Determine the set of instance group members on this host # which are not the instance in question. This is used to # determine how many other members from the same anti-affinity # group can be on this host. - members_on_host = ins_on_host & members - set([instance.uuid]) + members_on_host = (total_instances & members - + set([instance.uuid])) rules = group.rules if rules and 'max_server_per_host' in rules: max_server = rules['max_server_per_host'] @@ -1682,6 +1727,12 @@ raise exception.RescheduledException( instance_uuid=instance.uuid, reason=msg) + + # NOTE(ganso): The check for affinity below does not work and it + # can easily be violated because the lock happens in different + # compute hosts. + # The only fix seems to be a DB lock to perform the check whenever + # setting the host field to an instance. elif group.policy and 'affinity' == group.policy: group_hosts = group.get_hosts(exclude=[instance.uuid]) if group_hosts and self.host not in group_hosts: @@ -1690,8 +1741,7 @@ instance_uuid=instance.uuid, reason=msg) - if not CONF.workarounds.disable_group_policy_check_upcall: - _do_validation(context, instance, group_hint) + _do_validation(context, instance, group) def _log_original_error(self, exc_info, instance_uuid): LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid, @@ -2230,8 +2280,6 @@ instance=instance) self._cleanup_allocated_networks(context, instance, requested_networks) - self._cleanup_volumes(context, instance, - block_device_mapping, raise_exc=False) compute_utils.add_instance_fault_from_exc(context, instance, e, sys.exc_info(), fault_message=e.kwargs['reason']) @@ -3472,19 +3520,11 @@ self._notify_instance_rebuild_error(context, instance, e, bdms) raise else: - instance.apply_migration_context() - # NOTE (ndipanov): This save will now update the host and node - # attributes making sure that next RT pass is consistent since - # it will be based on the instance and not the migration DB - # entry. - instance.host = self.host - instance.node = scheduled_node - instance.save() - instance.drop_migration_context() - - # NOTE (ndipanov): Mark the migration as done only after we - # mark the instance as belonging to this host. - self._set_migration_status(migration, 'done') + # NOTE(gibi): Let the resource tracker set the instance + # host and drop the migration context as we need to hold the + # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with + # _update_available_resources. See bug 1896463. + self.rt.finish_evacuation(instance, scheduled_node, migration) def _do_rebuild_instance_with_claim( self, context, instance, orig_image_ref, image_meta, @@ -5220,10 +5260,24 @@ with self._error_out_instance_on_exception( context, instance, instance_state=instance_state),\ errors_out_migration_ctxt(migration): + self._send_prep_resize_notifications( context, instance, fields.NotificationPhase.START, instance_type) try: + scheduler_hints = self._get_scheduler_hints(filter_properties, + request_spec) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already + # in-progress, so this is the definitive moment to abort due to + # the policy violation. Also, exploding here is covered by the + # cleanup methods in except block. + try: + self._validate_instance_group_policy(context, instance, + scheduler_hints) + except exception.RescheduledException as e: + raise exception.InstanceFaultRollback(inner_exception=e) + self._prep_resize(context, image, instance, instance_type, filter_properties, node, migration, request_spec, @@ -7322,9 +7376,33 @@ @wrap_instance_fault def swap_volume(self, context, old_volume_id, new_volume_id, instance, new_attachment_id): - """Swap volume for an instance.""" - context = context.elevated() + """Replace the old volume with the new volume within the active server + :param context: User request context + :param old_volume_id: Original volume id + :param new_volume_id: New volume id being swapped to + :param instance: Instance with original_volume_id attached + :param new_attachment_id: ID of the new attachment for new_volume_id + """ + @utils.synchronized(instance.uuid) + def _do_locked_swap_volume(context, old_volume_id, new_volume_id, + instance, new_attachment_id): + self._do_swap_volume(context, old_volume_id, new_volume_id, + instance, new_attachment_id) + _do_locked_swap_volume(context, old_volume_id, new_volume_id, instance, + new_attachment_id) + + def _do_swap_volume(self, context, old_volume_id, new_volume_id, + instance, new_attachment_id): + """Replace the old volume with the new volume within the active server + + :param context: User request context + :param old_volume_id: Original volume id + :param new_volume_id: New volume id being swapped to + :param instance: Instance with original_volume_id attached + :param new_attachment_id: ID of the new attachment for new_volume_id + """ + context = context.elevated() compute_utils.notify_about_volume_swap( context, instance, self.host, fields.NotificationPhase.START, @@ -7802,6 +7880,20 @@ :param limits: objects.SchedulerLimits object for this live migration. :returns: a LiveMigrateData object (hypervisor-dependent) """ + + # Error out if this host cannot accept the new instance due + # to anti-affinity. This check at this moment is not very accurate, as + # multiple requests may be happening concurrently and miss the lock, + # but when it works it provides a better user experience by failing + # earlier. Also, it should be safe to explode here, error becomes + # NoValidHost and instance status remains ACTIVE. + try: + self._validate_instance_group_policy(ctxt, instance) + except exception.RescheduledException as e: + msg = ("Failed to validate instance group policy " + "due to: {}".format(e)) + raise exception.MigrationPreCheckError(reason=msg) + src_compute_info = obj_base.obj_to_primitive( self._get_compute_info(ctxt, instance.host)) dst_compute_info = obj_base.obj_to_primitive( @@ -7944,6 +8036,13 @@ """ LOG.debug('pre_live_migration data is %s', migrate_data) + # Error out if this host cannot accept the new instance due + # to anti-affinity. At this point the migration is already in-progress, + # so this is the definitive moment to abort due to the policy + # violation. Also, it should be safe to explode here. The instance + # status remains ACTIVE, migration status failed. + self._validate_instance_group_policy(context, instance) + migrate_data.old_vol_attachment_ids = {} bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) @@ -8061,8 +8160,8 @@ # We don't generate events if CONF.vif_plugging_timeout=0 # meaning that the operator disabled using them. if CONF.vif_plugging_timeout: - return [('network-vif-plugged', vif['id']) - for vif in instance.get_network_info()] + return (instance.get_network_info() + .get_live_migration_plug_time_events()) else: return [] diff -Nru nova-22.0.1/nova/compute/resource_tracker.py nova-22.2.2/nova/compute/resource_tracker.py --- nova-22.0.1/nova/compute/resource_tracker.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/compute/resource_tracker.py 2021-07-09 05:08:56.000000000 +0200 @@ -1970,3 +1970,21 @@ """ self.pci_tracker.free_instance_claims(context, instance) self.pci_tracker.save(context) + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def finish_evacuation(self, instance, node, migration): + instance.apply_migration_context() + # NOTE (ndipanov): This save will now update the host and node + # attributes making sure that next RT pass is consistent since + # it will be based on the instance and not the migration DB + # entry. + instance.host = self.host + instance.node = node + instance.save() + instance.drop_migration_context() + + # NOTE (ndipanov): Mark the migration as done only after we + # mark the instance as belonging to this host. + if migration: + migration.status = 'done' + migration.save() diff -Nru nova-22.0.1/nova/conf/compute.py nova-22.2.2/nova/conf/compute.py --- nova-22.0.1/nova/conf/compute.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/conf/compute.py 2021-07-09 05:08:56.000000000 +0200 @@ -962,10 +962,16 @@ The configured maximum is not enforced on shelved offloaded servers, as they have no compute host. +.. warning:: If this option is set to 0, the ``nova-compute`` service will fail + to start, as 0 disk devices is an invalid configuration that would + prevent instances from being able to boot. + Possible values: * -1 means unlimited -* Any integer >= 0 represents the maximum allowed +* Any integer >= 1 represents the maximum allowed. A value of 0 will cause the + ``nova-compute`` service to fail to start, as 0 disk devices is an invalid + configuration that would prevent instances from being able to boot. """), cfg.StrOpt('provider_config_location', default='/etc/nova/provider_config/', diff -Nru nova-22.0.1/nova/conf/pci.py nova-22.2.2/nova/conf/pci.py --- nova-22.0.1/nova/conf/pci.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/conf/pci.py 2021-07-09 05:08:56.000000000 +0200 @@ -116,7 +116,13 @@ ``address`` PCI address of the device. Both traditional glob style and regular - expression syntax is supported. + expression syntax is supported. Please note that the address fields are + restricted to the following maximum values: + + * domain - 0xFFFF + * bus - 0xFF + * slot - 0x1F + * function - 0x7 ``devname`` Device name of the device (for e.g. interface name). Not all PCI devices diff -Nru nova-22.0.1/nova/db/sqlalchemy/api.py nova-22.2.2/nova/db/sqlalchemy/api.py --- nova-22.0.1/nova/db/sqlalchemy/api.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/db/sqlalchemy/api.py 2021-07-09 05:08:56.000000000 +0200 @@ -47,6 +47,7 @@ from sqlalchemy.orm import aliased from sqlalchemy.orm import joinedload from sqlalchemy.orm import noload +from sqlalchemy.orm import subqueryload from sqlalchemy.orm import undefer from sqlalchemy.schema import Table from sqlalchemy import sql @@ -1266,13 +1267,27 @@ continue if 'extra.' in column: query = query.options(undefer(column)) + elif column in ['metadata', 'system_metadata']: + # NOTE(melwitt): We use subqueryload() instead of joinedload() for + # metadata and system_metadata because of the one-to-many + # relationship of the data. Directly joining these columns can + # result in a large number of additional rows being queried if an + # instance has a large number of (system_)metadata items, resulting + # in a large data transfer. Instead, the subqueryload() will + # perform additional queries to obtain metadata and system_metadata + # for the instance. + query = query.options(subqueryload(column)) else: query = query.options(joinedload(column)) # NOTE(alaski) Stop lazy loading of columns not needed. for col in ['metadata', 'system_metadata']: if col not in columns_to_join: query = query.options(noload(col)) - return query + # NOTE(melwitt): We need to use order_by() so that the + # additional queries emitted by subqueryload() include the same ordering as + # used by the parent query. + # https://docs.sqlalchemy.org/en/13/orm/loading_relationships.html#the-importance-of-ordering + return query.order_by(models.Instance.id) def _instances_fill_metadata(context, instances, manual_joins=None): @@ -4083,64 +4098,132 @@ ################## -def _archive_if_instance_deleted(table, shadow_table, instances, conn, - max_rows, before): - """Look for records that pertain to deleted instances, but may not be - deleted themselves. This catches cases where we delete an instance, - but leave some residue because of a failure in a cleanup path or - similar. - - Logic is: if I have a column called instance_uuid, and that instance - is deleted, then I can be deleted. - """ - - # NOTE(jake): handle instance_actions_events differently as it relies on - # instance_actions.id not instances.uuid - if table.name == "instance_actions_events": - instance_actions = models.BASE.metadata.tables["instance_actions"] - query_select = sql.select( - [table], - and_(instances.c.deleted != instances.c.deleted.default.arg, - instances.c.uuid == instance_actions.c.instance_uuid, - instance_actions.c.id == table.c.action_id)) - - else: - query_select = sql.select( - [table], - and_(instances.c.deleted != instances.c.deleted.default.arg, - instances.c.uuid == table.c.instance_uuid)) +def _get_tables_with_fk_to_table(table): + """Get a list of tables that refer to the given table by foreign key (FK). - if before: - query_select = query_select.where(instances.c.deleted_at < before) + :param table: Table object (parent) for which to find references by FK - query_select = query_select.order_by(table.c.id).limit(max_rows) + :returns: A list of Table objects that refer to the specified table by FK + """ + tables = [] + for t in models.BASE.metadata.tables.values(): + for fk in t.foreign_keys: + if fk.references(table): + tables.append(t) + return tables + + +def _get_fk_stmts(metadata, conn, table, column, records): + """Find records related to this table by foreign key (FK) and create and + return insert/delete statements for them. + + Logic is: find the tables that reference the table passed to this method + and walk the tree of references by FK. As child records are found, prepend + them to deques to execute later in a single database transaction (to avoid + orphaning related records if any one insert/delete fails or the archive + process is otherwise interrupted). + + :param metadata: Metadata object to use to construct a shadow Table object + :param conn: Connection object to use to select records related by FK + :param table: Table object (parent) for which to find references by FK + :param column: Column object (parent) to use to select records related by + FK + :param records: A list of records (column values) to use to select records + related by FK + + :returns: tuple of (insert statements, delete statements) for records + related by FK to insert into shadow tables and delete from main tables + """ + inserts = collections.deque() + deletes = collections.deque() + fk_tables = _get_tables_with_fk_to_table(table) + for fk_table in fk_tables: + # Create the shadow table for the referencing table. + fk_shadow_tablename = _SHADOW_TABLE_PREFIX + fk_table.name + try: + fk_shadow_table = Table(fk_shadow_tablename, metadata, + autoload=True) + except NoSuchTableError: + # No corresponding shadow table; skip it. + continue - query_insert = shadow_table.insert(inline=True).\ - from_select([c.name for c in table.c], query_select) + # TODO(stephenfin): Drop this when we drop the table + if fk_table.name == "dns_domains": + # We have one table (dns_domains) where the key is called + # "domain" rather than "id" + fk_column = fk_table.c.domain + else: + fk_column = fk_table.c.id - delete_statement = DeleteFromSelect(table, query_select, - table.c.id) + for fk in fk_table.foreign_keys: + # We need to find the records in the referring (child) table that + # correspond to the records in our (parent) table so we can archive + # them. + + # First, select the column in the parent referenced by the child + # table that corresponds to the parent table records that were + # passed in. + # Example: table = 'instances' and fk_table = 'instance_extra' + # fk.parent = instance_extra.instance_uuid + # fk.column = instances.uuid + # SELECT instances.uuid FROM instances, instance_extra + # WHERE instance_extra.instance_uuid = instances.uuid + # AND instance.id IN () + # We need the instance uuids for the in order to + # look up the matching instance_extra records. + select = sql.select([fk.column]).where( + sql.and_(fk.parent == fk.column, column.in_(records))) + rows = conn.execute(select).fetchall() + p_records = [r[0] for r in rows] + # Then, select rows in the child table that correspond to the + # parent table records that were passed in. + # Example: table = 'instances' and fk_table = 'instance_extra' + # fk.parent = instance_extra.instance_uuid + # fk.column = instances.uuid + # SELECT instance_extra.id FROM instance_extra, instances + # WHERE instance_extra.instance_uuid = instances.uuid + # AND instances.uuid IN () + # We will get the instance_extra ids we need to archive + # them. + fk_select = sql.select([fk_column]).where( + sql.and_(fk.parent == fk.column, fk.column.in_(p_records))) + fk_rows = conn.execute(fk_select).fetchall() + fk_records = [r[0] for r in fk_rows] + if fk_records: + # If we found any records in the child table, create shadow + # table insert statements for them and prepend them to the + # deque. + fk_columns = [c.name for c in fk_table.c] + fk_insert = fk_shadow_table.insert(inline=True).\ + from_select(fk_columns, sql.select([fk_table], + fk_column.in_(fk_records))) + inserts.appendleft(fk_insert) + # Create main table delete statements and prepend them to the + # deque. + fk_delete = fk_table.delete().where(fk_column.in_(fk_records)) + deletes.appendleft(fk_delete) + # Repeat for any possible nested child tables. + i, d = _get_fk_stmts(metadata, conn, fk_table, fk_column, fk_records) + inserts.extendleft(i) + deletes.extendleft(d) - try: - with conn.begin(): - conn.execute(query_insert) - result_delete = conn.execute(delete_statement) - return result_delete.rowcount - except db_exc.DBReferenceError as ex: - LOG.warning('Failed to archive %(table)s: %(error)s', - {'table': table.name, - 'error': six.text_type(ex)}) - return 0 + return inserts, deletes def _archive_deleted_rows_for_table(metadata, tablename, max_rows, before): """Move up to max_rows rows from one tables to the corresponding shadow table. - :returns: 2-item tuple: + Will also follow FK constraints and archive all referring rows. + Example: archving a record from the 'instances' table will also archive + the 'instance_extra' record before archiving the 'instances' record. + + :returns: 3-item tuple: - number of rows archived - list of UUIDs of instances that were archived + - number of extra rows archived (due to FK constraints) + dict of {tablename: rows_archived} """ conn = metadata.bind.connect() # NOTE(tdurakov): table metadata should be received @@ -4156,7 +4239,7 @@ shadow_table = Table(shadow_tablename, metadata, autoload=True) except NoSuchTableError: # No corresponding shadow table; skip it. - return rows_archived, deleted_instance_uuids + return rows_archived, deleted_instance_uuids, {} # TODO(stephenfin): Drop this when we drop the table if tablename == "dns_domains": @@ -4179,10 +4262,29 @@ rows = conn.execute(select).fetchall() records = [r[0] for r in rows] + # We will archive deleted rows for this table and also generate insert and + # delete statements for extra rows we may archive by following FK + # relationships. Because we are iterating over the sorted_tables (list of + # Table objects sorted in order of foreign key dependency), new inserts and + # deletes ("leaves") will be added to the fronts of the deques created in + # _get_fk_stmts. This way, we make sure we delete child table records + # before we delete their parent table records. + + # Keep track of any extra tablenames to number of rows that we archive by + # following FK relationships. + # {tablename: extra_rows_archived} + extras = collections.defaultdict(int) if records: insert = shadow_table.insert(inline=True).\ from_select(columns, sql.select([table], column.in_(records))) delete = table.delete().where(column.in_(records)) + # Walk FK relationships and add insert/delete statements for rows that + # refer to this table via FK constraints. fk_inserts and fk_deletes + # will be prepended to by _get_fk_stmts if referring rows are found by + # FK constraints. + fk_inserts, fk_deletes = _get_fk_stmts( + metadata, conn, table, column, records) + # NOTE(tssurya): In order to facilitate the deletion of records from # instance_mappings, request_specs and instance_group_member tables in # the nova_api DB, the rows of deleted instances from the instances @@ -4196,9 +4298,14 @@ try: # Group the insert and delete in a transaction. with conn.begin(): + for fk_insert in fk_inserts: + conn.execute(fk_insert) + for fk_delete in fk_deletes: + result_fk_delete = conn.execute(fk_delete) + extras[fk_delete.table.name] += result_fk_delete.rowcount conn.execute(insert) result_delete = conn.execute(delete) - rows_archived = result_delete.rowcount + rows_archived += result_delete.rowcount except db_exc.DBReferenceError as ex: # A foreign key constraint keeps us from deleting some of # these rows until we clean up a dependent table. Just @@ -4207,18 +4314,7 @@ "%(tablename)s: %(error)s", {'tablename': tablename, 'error': six.text_type(ex)}) - # NOTE(jake): instance_actions_events doesn't have a instance_uuid column - # but still needs to be archived as it is a FK constraint - if ((max_rows is None or rows_archived < max_rows) and - ('instance_uuid' in columns or - tablename == 'instance_actions_events')): - instances = models.BASE.metadata.tables['instances'] - limit = max_rows - rows_archived if max_rows is not None else None - extra = _archive_if_instance_deleted(table, shadow_table, instances, - conn, limit, before) - rows_archived += extra - - return rows_archived, deleted_instance_uuids + return rows_archived, deleted_instance_uuids, extras def archive_deleted_rows(context=None, max_rows=None, before=None): @@ -4242,13 +4338,18 @@ - list of UUIDs of instances that were archived - total number of rows that were archived """ - table_to_rows_archived = {} + table_to_rows_archived = collections.defaultdict(int) deleted_instance_uuids = [] total_rows_archived = 0 meta = MetaData(get_engine(use_slave=True, context=context)) meta.reflect() - # Reverse sort the tables so we get the leaf nodes first for processing. - for table in reversed(meta.sorted_tables): + # Get the sorted list of tables in order of foreign key dependency. + # Process the parent tables and find their dependent records in order to + # archive the related records in a single database transactions. The goal + # is to avoid a situation where, for example, an 'instances' table record + # is missing its corresponding 'instance_extra' record due to running the + # archive_deleted_rows command with max_rows. + for table in meta.sorted_tables: tablename = table.name rows_archived = 0 # skip the special sqlalchemy-migrate migrate_version table and any @@ -4256,7 +4357,7 @@ if (tablename == 'migrate_version' or tablename.startswith(_SHADOW_TABLE_PREFIX)): continue - rows_archived, _deleted_instance_uuids = ( + rows_archived, _deleted_instance_uuids, extras = ( _archive_deleted_rows_for_table( meta, tablename, max_rows=max_rows - total_rows_archived, @@ -4267,6 +4368,9 @@ # Only report results for tables that had updates. if rows_archived: table_to_rows_archived[tablename] = rows_archived + for tablename, extra_rows_archived in extras.items(): + table_to_rows_archived[tablename] += extra_rows_archived + total_rows_archived += extra_rows_archived if total_rows_archived >= max_rows: break return table_to_rows_archived, deleted_instance_uuids, total_rows_archived diff -Nru nova-22.0.1/nova/exception.py nova-22.2.2/nova/exception.py --- nova-22.0.1/nova/exception.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/exception.py 2021-07-09 05:08:56.000000000 +0200 @@ -559,6 +559,14 @@ "Unable to continue.") +class TooOldComputeService(Invalid): + msg_fmt = _("Current Nova version does not support computes older than " + "%(oldest_supported_version)s but the minimum compute service " + "level in your %(scope)s is %(min_service_level)d and the " + "oldest supported service level is " + "%(oldest_supported_service)d.") + + class DestinationDiskExists(Invalid): msg_fmt = _("The supplied disk path (%(path)s) already exists, " "it is expected not to exist.") @@ -2281,6 +2289,10 @@ "please check your conf file. ") +class GetPMEMNamespacesFailed(NovaException): + msg_fmt = _("Get PMEM namespaces on host failed: %(reason)s.") + + class VPMEMCleanupFailed(NovaException): msg_fmt = _("Failed to clean up the vpmem backend device %(dev)s: " "%(error)s") diff -Nru nova-22.0.1/nova/network/model.py nova-22.2.2/nova/network/model.py --- nova-22.0.1/nova/network/model.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/network/model.py 2021-07-09 05:08:56.000000000 +0200 @@ -469,6 +469,14 @@ return (self.is_hybrid_plug_enabled() and not migration.is_same_host()) + @property + def has_live_migration_plug_time_event(self): + """Returns whether this VIF's network-vif-plugged external event will + be sent by Neutron at "plugtime" - in other words, as soon as neutron + completes configuring the network backend. + """ + return self.is_hybrid_plug_enabled() + def is_hybrid_plug_enabled(self): return self['details'].get(VIF_DETAILS_OVS_HYBRID_PLUG, False) @@ -530,15 +538,22 @@ return jsonutils.dumps(self) def get_bind_time_events(self, migration): - """Returns whether any of our VIFs have "bind-time" events. See - has_bind_time_event() docstring for more details. + """Returns a list of external events for any VIFs that have + "bind-time" events during cold migration. """ return [('network-vif-plugged', vif['id']) for vif in self if vif.has_bind_time_event(migration)] + def get_live_migration_plug_time_events(self): + """Returns a list of external events for any VIFs that have + "plug-time" events during live migration. + """ + return [('network-vif-plugged', vif['id']) + for vif in self if vif.has_live_migration_plug_time_event] + def get_plug_time_events(self, migration): - """Complementary to get_bind_time_events(), any event that does not - fall in that category is a plug-time event. + """Returns a list of external events for any VIFs that have + "plug-time" events during cold migration. """ return [('network-vif-plugged', vif['id']) for vif in self if not vif.has_bind_time_event(migration)] diff -Nru nova-22.0.1/nova/network/neutron.py nova-22.2.2/nova/network/neutron.py --- nova-22.0.1/nova/network/neutron.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/network/neutron.py 2021-07-09 05:08:56.000000000 +0200 @@ -271,6 +271,7 @@ client = utils.get_ksa_adapter( 'network', ksa_auth=auth_plugin, ksa_session=session) client.additional_headers = {'accept': 'application/json'} + client.connect_retries = CONF.neutron.http_retries return client @@ -802,9 +803,15 @@ # TODO(arosen) Should optimize more to do direct query for security # group if len(security_groups) == 1 if len(security_groups): + # NOTE(slaweq): fields other than name and id aren't really needed + # so asking only about those fields will allow Neutron to not + # prepare list of rules for each found security group. That may + # speed processing of this request a lot in case when tenant has + # got many security groups + sg_fields = ['id', 'name'] search_opts = {'tenant_id': instance.project_id} user_security_groups = neutron.list_security_groups( - **search_opts).get('security_groups') + fields=sg_fields, **search_opts).get('security_groups') for security_group in security_groups: name_match = None diff -Nru nova-22.0.1/nova/objects/service.py nova-22.2.2/nova/objects/service.py --- nova-22.0.1/nova/objects/service.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/objects/service.py 2021-07-09 05:08:56.000000000 +0200 @@ -192,6 +192,13 @@ {'compute_rpc': '5.12'}, ) +# This is used to raise an error at service startup if older than N-1 computes +# are detected. Update this at the beginning of every release cycle +OLDEST_SUPPORTED_SERVICE_VERSION = 'Ussuri' +SERVICE_VERSION_ALIASES = { + 'Ussuri': 41 +} + # TODO(berrange): Remove NovaObjectDictCompat @base.NovaObjectRegistry.register diff -Nru nova-22.0.1/nova/pci/manager.py nova-22.2.2/nova/pci/manager.py --- nova-22.0.1/nova/pci/manager.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/pci/manager.py 2021-07-09 05:08:56.000000000 +0200 @@ -117,8 +117,42 @@ devices = [] for dev in jsonutils.loads(devices_json): - if self.dev_filter.device_assignable(dev): - devices.append(dev) + try: + if self.dev_filter.device_assignable(dev): + devices.append(dev) + except exception.PciConfigInvalidWhitelist as e: + # The raised exception is misleading as the problem is not with + # the whitelist config but with the host PCI device reported by + # libvirt. The code that matches the host PCI device to the + # withelist spec reuses the WhitelistPciAddress object to parse + # the host PCI device address. That parsing can fail if the + # PCI address has a 32 bit domain. But this should not prevent + # processing the rest of the devices. So we simply skip this + # device and continue. + # Please note that this except block does not ignore the + # invalid whitelist configuration. The whitelist config has + # already been parsed or rejected in case it was invalid. At + # this point the self.dev_filter representes the parsed and + # validated whitelist config. + LOG.debug( + 'Skipping PCI device %s reported by the hypervisor: %s', + {k: v for k, v in dev.items() + if k in ['address', 'parent_addr']}, + # NOTE(gibi): this is ugly but the device_assignable() call + # uses the PhysicalPciAddress class to parse the PCI + # addresses and that class reuses the code from + # PciAddressSpec that was originally designed to parse + # whitelist spec. Hence the raised exception talks about + # whitelist config. This is misleading as in our case the + # PCI address that we failed to parse came from the + # hypervisor. + # TODO(gibi): refactor the false abstraction to make the + # code reuse clean from the false assumption that we only + # parse whitelist config with + # devspec.PciAddressSpec._set_pci_dev_info() + str(e).replace( + 'Invalid PCI devices Whitelist config:', 'The')) + self._set_hvdevs(devices) @staticmethod diff -Nru nova-22.0.1/nova/service.py nova-22.2.2/nova/service.py --- nova-22.0.1/nova/service.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/service.py 2021-07-09 05:08:56.000000000 +0200 @@ -255,6 +255,17 @@ periodic_fuzzy_delay=periodic_fuzzy_delay, periodic_interval_max=periodic_interval_max) + # NOTE(gibi): This have to be after the service object creation as + # that is the point where we can safely use the RPC to the conductor. + # E.g. the Service.__init__ actually waits for the conductor to start + # up before it allows the service to be created. The + # raise_if_old_compute() depends on the RPC to be up and does not + # implement its own retry mechanism to connect to the conductor. + try: + utils.raise_if_old_compute() + except exception.TooOldComputeService as e: + LOG.warning(str(e)) + return service_obj def kill(self): diff -Nru nova-22.0.1/nova/storage/rbd_utils.py nova-22.2.2/nova/storage/rbd_utils.py --- nova-22.0.1/nova/storage/rbd_utils.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/storage/rbd_utils.py 2021-07-09 05:08:56.000000000 +0200 @@ -405,7 +405,14 @@ # MAX_AVAIL stat will divide by the replication size when doing the # calculation. args = ['ceph', 'df', '--format=json'] + self.ceph_args() - out, _ = processutils.execute(*args) + + try: + out, _ = processutils.execute(*args) + except processutils.ProcessExecutionError: + LOG.exception('Could not determine disk usage') + raise exception.StorageError( + reason='Could not determine disk usage') + stats = jsonutils.loads(out) # Find the pool for which we are configured. diff -Nru nova-22.0.1/nova/test.py nova-22.2.2/nova/test.py --- nova-22.0.1/nova/test.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/test.py 2021-07-09 05:08:56.000000000 +0200 @@ -51,10 +51,13 @@ from oslotest import mock_fixture import six from six.moves import builtins +from sqlalchemy.dialects import sqlite import testtools +from nova.api.openstack import wsgi_app from nova.compute import rpcapi as compute_rpcapi from nova import context +from nova.db.sqlalchemy import api as sqlalchemy_api from nova import exception from nova import objects from nova.objects import base as objects_base @@ -283,6 +286,10 @@ self.useFixture(nova_fixtures.GenericPoisonFixture()) + # make sure that the wsgi app is fully initialized for all testcase + # instead of only once initialized for test worker + wsgi_app.init_global_data.reset() + def _setup_cells(self): """Setup a normal cellsv2 environment. @@ -376,6 +383,22 @@ for k, v in kw.items(): CONF.set_override(k, v, group) + def enforce_fk_constraints(self, engine=None): + if engine is None: + engine = sqlalchemy_api.get_engine() + dialect = engine.url.get_dialect() + if dialect == sqlite.dialect: + # We're seeing issues with foreign key support in SQLite 3.6.20 + # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 + # It works fine in SQLite 3.7. + # So return early to skip this test if running SQLite < 3.7 + import sqlite3 + tup = sqlite3.sqlite_version_info + if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): + self.skipTest( + 'sqlite version too old for reliable SQLA foreign_keys') + engine.connect().execute("PRAGMA foreign_keys = ON") + def start_service(self, name, host=None, cell_name=None, **kwargs): # Disallow starting multiple scheduler services if name == 'scheduler' and self._service_fixture_count[name]: diff -Nru nova-22.0.1/nova/tests/functional/db/test_archive.py nova-22.2.2/nova/tests/functional/db/test_archive.py --- nova-22.0.1/nova/tests/functional/db/test_archive.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/db/test_archive.py 2021-07-09 05:08:56.000000000 +0200 @@ -17,7 +17,6 @@ from dateutil import parser as dateutil_parser from oslo_utils import timeutils -from sqlalchemy.dialects import sqlite from sqlalchemy import func from sqlalchemy import MetaData from sqlalchemy import select @@ -33,22 +32,7 @@ def setUp(self): super(TestDatabaseArchive, self).setUp() - # TODO(mriedem): pull this out so we can re-use it in - # test_archive_deleted_rows_fk_constraint - # SQLite doesn't enforce foreign key constraints without a pragma. - engine = sqlalchemy_api.get_engine() - dialect = engine.url.get_dialect() - if dialect == sqlite.dialect: - # We're seeing issues with foreign key support in SQLite 3.6.20 - # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 - # It works fine in SQLite 3.7. - # So return early to skip this test if running SQLite < 3.7 - import sqlite3 - tup = sqlite3.sqlite_version_info - if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): - self.skipTest( - 'sqlite version too old for reliable SQLA foreign_keys') - engine.connect().execute("PRAGMA foreign_keys = ON") + self.enforce_fk_constraints() def test_archive_deleted_rows(self): # Boots a server, deletes it, and then tries to archive it. @@ -114,6 +98,19 @@ # Verify we have some system_metadata since we'll check that later. self.assertTrue(len(instance.system_metadata), 'No system_metadata for instance: %s' % server_id) + # Create a pci_devices record to simulate an instance that had a PCI + # device allocated at the time it was deleted. There is a window of + # time between deletion of the instance record and freeing of the PCI + # device in nova-compute's _complete_deletion method during RT update. + db.pci_device_update(admin_context, 1, 'fake-address', + {'compute_node_id': 1, + 'address': 'fake-address', + 'vendor_id': 'fake', + 'product_id': 'fake', + 'dev_type': 'fake', + 'label': 'fake', + 'status': 'allocated', + 'instance_uuid': instance.uuid}) # Now try and archive the soft deleted records. results, deleted_instance_uuids, archived = \ db.archive_deleted_rows(max_rows=100) @@ -128,6 +125,49 @@ self.assertIn('instance_actions', results) self.assertIn('instance_actions_events', results) self.assertEqual(sum(results.values()), archived) + # Verify that the pci_devices record has not been dropped + self.assertNotIn('pci_devices', results) + + def test_archive_deleted_rows_incomplete(self): + """This tests a scenario where archive_deleted_rows is run with + --max_rows and does not run to completion. + + That is, the archive is stopped before all archivable records have been + archived. Specifically, the problematic state is when a single instance + becomes partially archived (example: 'instance_extra' record for one + instance has been archived while its 'instances' record remains). Any + access of the instance (example: listing deleted instances) that + triggers the retrieval of a dependent record that has been archived + away, results in undefined behavior that may raise an error. + + We will force the system into a state where a single deleted instance + is partially archived. We want to verify that we can, for example, + successfully do a GET /servers/detail at any point between partial + archive_deleted_rows runs without errors. + """ + # Boots a server, deletes it, and then tries to archive it. + server = self._create_server() + server_id = server['id'] + # Assert that there are instance_actions. instance_actions are + # interesting since we don't soft delete them but they have a foreign + # key back to the instances table. + actions = self.api.get_instance_actions(server_id) + self.assertTrue(len(actions), + 'No instance actions for server: %s' % server_id) + self._delete_server(server) + # Archive deleted records iteratively, 1 row at a time, and try to do a + # GET /servers/detail between each run. All should succeed. + exceptions = [] + while True: + _, _, archived = db.archive_deleted_rows(max_rows=1) + try: + # Need to use the admin API to list deleted servers. + self.admin_api.get_servers(search_opts={'deleted': True}) + except Exception as ex: + exceptions.append(ex) + if archived == 0: + break + self.assertFalse(exceptions) def _get_table_counts(self): engine = sqlalchemy_api.get_engine() diff -Nru nova-22.0.1/nova/tests/functional/regressions/test_bug_1896463.py nova-22.2.2/nova/tests/functional/regressions/test_bug_1896463.py --- nova-22.0.1/nova/tests/functional/regressions/test_bug_1896463.py 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/regressions/test_bug_1896463.py 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,222 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import fixtures +import time + +from oslo_config import cfg + +from nova import context +from nova import objects +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers +from nova import utils +from nova.virt import fake + + +CONF = cfg.CONF + + +class TestEvacuateResourceTrackerRace( + test.TestCase, integrated_helpers.InstanceHelperMixin, +): + """Demonstrate bug #1896463. + + Trigger a race condition between an almost finished evacuation that is + dropping the migration context, and the _update_available_resource() + periodic task that already loaded the instance list but haven't loaded the + migration list yet. The result is that the PCI allocation made by the + evacuation is deleted by the overlapping periodic task run and the instance + will not have PCI allocation after the evacuation. + """ + + def setUp(self): + super().setUp() + self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self)) + self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.placement = self.useFixture(func_fixtures.PlacementFixture()).api + + self.api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + + self.admin_api = self.api_fixture.admin_api + self.admin_api.microversion = 'latest' + self.api = self.admin_api + + self.start_service('conductor') + self.start_service('scheduler') + + self.flags(compute_driver='fake.FakeDriverWithPciResources') + self.useFixture( + fake.FakeDriverWithPciResources. + FakeDriverWithPciResourcesConfigFixture()) + + self.compute1 = self._start_compute('host1') + self.compute1_id = self._get_compute_node_id_by_host('host1') + self.compute1_service_id = self.admin_api.get_services( + host='host1', binary='nova-compute')[0]['id'] + + self.compute2 = self._start_compute('host2') + self.compute2_id = self._get_compute_node_id_by_host('host2') + self.compute2_service_id = self.admin_api.get_services( + host='host2', binary='nova-compute')[0]['id'] + + # add extra ports and the related network to the neutron fixture + # specifically for these tests. It cannot be added globally in the + # fixture init as it adds a second network that makes auto allocation + # based test to fail due to ambiguous networks. + self.neutron._ports[self.neutron.sriov_port['id']] = \ + copy.deepcopy(self.neutron.sriov_port) + self.neutron._networks[ + self.neutron.network_2['id']] = self.neutron.network_2 + self.neutron._subnets[ + self.neutron.subnet_2['id']] = self.neutron.subnet_2 + + self.ctxt = context.get_admin_context() + + def _get_compute_node_id_by_host(self, host): + # we specifically need the integer id of the node not the UUID so we + # need to use the old microversion + with utils.temporary_mutation(self.admin_api, microversion='2.52'): + hypers = self.admin_api.api_get( + 'os-hypervisors').body['hypervisors'] + for hyper in hypers: + if hyper['hypervisor_hostname'] == host: + return hyper['id'] + + self.fail('Hypervisor with hostname=%s not found' % host) + + def _assert_pci_device_allocated( + self, instance_uuid, compute_node_id, num=1): + """Assert that a given number of PCI devices are allocated to the + instance on the given host. + """ + + devices = objects.PciDeviceList.get_by_instance_uuid( + self.ctxt, instance_uuid) + devices_on_host = [dev for dev in devices + if dev.compute_node_id == compute_node_id] + self.assertEqual(num, len(devices_on_host)) + + def test_evacuate_races_with_update_available_resource(self): + # Create a server with a direct port to have PCI allocation + server = self._create_server( + name='test-server-for-bug-1896463', + networks=[{'port': self.neutron.sriov_port['id']}], + host='host1' + ) + + self._assert_pci_device_allocated(server['id'], self.compute1_id) + self._assert_pci_device_allocated( + server['id'], self.compute2_id, num=0) + + # stop and force down the compute the instance is on to allow + # evacuation + self.compute1.stop() + self.admin_api.put_service( + self.compute1_service_id, {'forced_down': 'true'}) + + # Inject some sleeps both in the Instance.drop_migration_context and + # the MigrationList.get_in_progress_and_error code to make them + # overlap. + # We want to create the following execution scenario: + # 1) The evacuation makes a move claim on the dest including the PCI + # claim. This means there is a migration context. But the evacuation + # is not complete yet so the instance.host does not point to the + # dest host. + # 2) The dest resource tracker starts an _update_available_resource() + # periodic task and this task loads the list of instances on its + # host from the DB. Our instance is not in this list due to #1. + # 3) The evacuation finishes, the instance.host is set to the dest host + # and the migration context is deleted. + # 4) The periodic task now loads the list of in-progress migration from + # the DB to check for incoming our outgoing migrations. However due + # to #3 our instance is not in this list either. + # 5) The periodic task cleans up every lingering PCI claim that is not + # connected to any instance collected above from the instance list + # and from the migration list. As our instance is not in either of + # the lists, the resource tracker cleans up the PCI allocation for + # the already finished evacuation of our instance. + # + # Unfortunately we cannot reproduce the above situation without sleeps. + # We need that the evac starts first then the periodic starts, but not + # finishes, then evac finishes, then periodic finishes. If I trigger + # and run the whole periodic in a wrapper of drop_migration_context + # then I could not reproduce the situation described at #4). In general + # it is not + # + # evac + # | + # | + # | periodic + # | | + # | | + # | x + # | + # | + # x + # + # but + # + # evac + # | + # | + # | periodic + # | | + # | | + # | | + # x | + # | + # x + # + # what is needed need. + # + # Starting the periodic from the test in a separate thread at + # drop_migration_context() might work but that is an extra complexity + # in the test code. Also it might need a sleep still to make the + # reproduction stable but only one sleep instead of two. + orig_drop = objects.Instance.drop_migration_context + + def slow_drop(*args, **kwargs): + time.sleep(1) + return orig_drop(*args, **kwargs) + + self.useFixture( + fixtures.MockPatch( + 'nova.objects.instance.Instance.drop_migration_context', + new=slow_drop)) + + orig_get_mig = objects.MigrationList.get_in_progress_and_error + + def slow_get_mig(*args, **kwargs): + time.sleep(2) + return orig_get_mig(*args, **kwargs) + + self.useFixture( + fixtures.MockPatch( + 'nova.objects.migration.MigrationList.' + 'get_in_progress_and_error', + new=slow_get_mig)) + + self.admin_api.post_server_action(server['id'], {'evacuate': {}}) + # we trigger the _update_available_resource periodic to overlap with + # the already started evacuation + self._run_periodics() + + self._wait_for_server_parameter( + server, {'OS-EXT-SRV-ATTR:host': 'host2', 'status': 'ACTIVE'}) + + self._assert_pci_device_allocated(server['id'], self.compute1_id) + self._assert_pci_device_allocated(server['id'], self.compute2_id) diff -Nru nova-22.0.1/nova/tests/functional/regressions/test_bug_1899649.py nova-22.2.2/nova/tests/functional/regressions/test_bug_1899649.py --- nova-22.0.1/nova/tests/functional/regressions/test_bug_1899649.py 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/regressions/test_bug_1899649.py 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,100 @@ +# Copyright 2020, Red Hat, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional.libvirt import base +from nova.tests.unit.virt.libvirt import fakelibvirt + + +class TestVolAttachmentsAfterFailureToScheduleOrBuild(base.ServersTestBase): + """Regression test for bug . + + This regression test aims to ensure a volume attachment remains in place + after a failure to either schedule a server or when building a server + directly on a compute after skipping the scheduler. + + A volume attachment is required to remain after such failures to ensure the + volume itself remains marked as reserved. + + To ensure this is as accurate as possible the tests use the libvirt + functional base class to mimic a real world example with NUMA nodes being + requested via flavor extra specs. The underlying compute being unable to + meet this request ensuring a failure. + """ + + microversion = 'latest' + + def setUp(self): + super().setUp() + + # Launch a single libvirt based compute service with a single NUMA node + host_info = fakelibvirt.HostInfo( + cpu_nodes=1, cpu_sockets=1, cpu_cores=2, kB_mem=15740000) + self.start_compute(host_info=host_info, hostname='compute1') + + # Use a flavor requesting 2 NUMA nodes that we know will always fail + self.flavor_id = self._create_flavor(extra_spec={'hw:numa_nodes': '2'}) + + # Craft a common bfv server request for use within each test + self.volume_id = nova_fixtures.CinderFixture.IMAGE_BACKED_VOL + self.server = { + 'name': 'test', + 'flavorRef': self.flavor_id, + 'imageRef': '', + 'networks': 'none', + 'block_device_mapping_v2': [{ + 'source_type': 'volume', + 'destination_type': 'volume', + 'boot_index': 0, + 'uuid': self.volume_id}] + } + + def _assert_failure_and_volume_attachments(self, server): + # Assert that the server is in an ERROR state + self._wait_for_state_change(server, 'ERROR') + + # Assert that the volume is in a reserved state. As this isn't modelled + # by the CinderFixture we just assert that a single volume attachment + # remains after the failure and that it is referenced by the server. + attachments = self.cinder.volume_to_attachment.get(self.volume_id) + self.assertEqual(1, len(attachments)) + self.assertIn( + self.volume_id, self.cinder.volume_ids_for_instance(server['id'])) + + def test_failure_to_schedule(self): + # Assert that a volume attachment remains after a failure to schedule + server = self.api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_schedule_with_az(self): + # Assert that a volume attachment remains after a failure to schedule + # with the addition of an availability_zone in the request + self.server['availability_zone'] = 'nova' + server = self.api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_schedule_with_host(self): + # Assert that a volume attachment remains after a failure to schedule + # using the optional host parameter introduced in microversion 2.74 + self.server['host'] = 'compute1' + server = self.admin_api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) + + def test_failure_to_build_with_az_and_host(self): + # Assert that a volume attachments remain after a failure to + # build and reschedule by providing an availability_zone *and* host, + # skipping the scheduler. This is bug #1899649. + self.server['availability_zone'] = 'nova:compute1' + server = self.admin_api.post_server({'server': self.server}) + self._assert_failure_and_volume_attachments(server) diff -Nru nova-22.0.1/nova/tests/functional/regressions/test_bug_1914777.py nova-22.2.2/nova/tests/functional/regressions/test_bug_1914777.py --- nova-22.0.1/nova/tests/functional/regressions/test_bug_1914777.py 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/regressions/test_bug_1914777.py 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,138 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import mock + +from nova import context as nova_context +from nova import exception +from nova import objects +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit import policy_fixture + + +class TestDeleteWhileBooting(test.TestCase, + integrated_helpers.InstanceHelperMixin): + """This tests race scenarios where an instance is deleted while booting. + + In these scenarios, the nova-api service is racing with nova-conductor + service; nova-conductor is in the middle of booting the instance when + nova-api begins fulfillment of a delete request. As the two services + delete records out from under each other, both services need to handle + it properly such that a delete request will always be fulfilled. + + Another scenario where two requests can race and delete things out from + under each other is if two or more delete requests are racing while the + instance is booting. + + In order to force things into states where bugs have occurred, we must + mock some object retrievals from the database to simulate the different + points at which a delete request races with a create request or another + delete request. We aim to mock only the bare minimum necessary to recreate + the bug scenarios. + """ + def setUp(self): + super(TestDeleteWhileBooting, self).setUp() + self.useFixture(policy_fixture.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.GlanceFixture(self)) + + api_fixture = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')) + self.api = api_fixture.api + + self.ctxt = nova_context.get_context() + + # We intentionally do not start a conductor or scheduler service, since + # our goal is to simulate an instance that has not been scheduled yet. + + # Kick off a server create request and move on once it's in the BUILD + # state. Since we have no conductor or scheduler service running, the + # server will "hang" in an unscheduled state for testing. + self.server = self._create_server(expected_state='BUILD') + # Simulate that a different request has deleted the build request + # record after this delete request has begun processing. (The first + # lookup of the build request occurs in the servers API to get the + # instance object in order to delete it). + # We need to get the build request now before we mock the method. + self.br = objects.BuildRequest.get_by_instance_uuid( + self.ctxt, self.server['id']) + + @mock.patch('nova.objects.build_request.BuildRequest.get_by_instance_uuid') + def test_build_request_and_instance_not_found(self, mock_get_br): + """This tests a scenario where another request has deleted the build + request record and the instance record ahead of us. + """ + # The first lookup at the beginning of the delete request in the + # ServersController succeeds and the second lookup to handle "delete + # while booting" in compute/api fails after a different request has + # deleted it. + br_not_found = exception.BuildRequestNotFound(uuid=self.server['id']) + mock_get_br.side_effect = [self.br, br_not_found, br_not_found] + self._delete_server(self.server) + + @mock.patch('nova.objects.build_request.BuildRequest.get_by_instance_uuid') + @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid') + @mock.patch('nova.objects.instance.Instance.get_by_uuid') + def test_deleting_instance_at_the_same_time(self, mock_get_i, mock_get_im, + mock_get_br): + """This tests the scenario where another request is trying to delete + the instance record at the same time we are, while the instance is + booting. An example of this: while the create and delete are running at + the same time, the delete request deletes the build request, the create + request finds the build request already deleted when it tries to delete + it. The create request deletes the instance record and then delete + request tries to lookup the instance after it deletes the build + request. Its attempt to lookup the instance fails because the create + request already deleted it. + """ + # First lookup at the beginning of the delete request in the + # ServersController succeeds, second lookup to handle "delete while + # booting" in compute/api fails after the conductor has deleted it. + br_not_found = exception.BuildRequestNotFound(uuid=self.server['id']) + mock_get_br.side_effect = [self.br, br_not_found] + # Simulate the instance transitioning from having no cell assigned to + # having a cell assigned while the delete request is being processed. + # First lookup of the instance mapping has the instance unmapped (no + # cell) and subsequent lookups have the instance mapped to cell1. + no_cell_im = objects.InstanceMapping( + context=self.ctxt, instance_uuid=self.server['id'], + cell_mapping=None) + has_cell_im = objects.InstanceMapping( + context=self.ctxt, instance_uuid=self.server['id'], + cell_mapping=self.cell_mappings['cell1']) + mock_get_im.side_effect = [ + no_cell_im, has_cell_im, has_cell_im, has_cell_im, has_cell_im] + # Simulate that the instance object has been created by the conductor + # in the create path while the delete request is being processed. + # First lookups are before the instance has been deleted and the last + # lookup is after the conductor has deleted the instance. Use the build + # request to make an instance object for testing. + i = self.br.get_new_instance(self.ctxt) + i_not_found = exception.InstanceNotFound(instance_id=self.server['id']) + mock_get_i.side_effect = [i, i, i, i_not_found, i_not_found] + + # Simulate that the conductor is running instance_destroy at the same + # time as we are. + def fake_instance_destroy(*args, **kwargs): + # NOTE(melwitt): This is a misleading exception, as it is not only + # raised when a constraint on 'host' is not met, but also when two + # instance_destroy calls are racing. In this test, the soft delete + # returns 0 rows affected because another request soft deleted the + # record first. + raise exception.ObjectActionError( + action='destroy', reason='host changed') + + self.stub_out( + 'nova.objects.instance.Instance.destroy', fake_instance_destroy) + self._delete_server(self.server) diff -Nru nova-22.0.1/nova/tests/functional/test_nova_manage.py nova-22.2.2/nova/tests/functional/test_nova_manage.py --- nova-22.0.1/nova/tests/functional/test_nova_manage.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/test_nova_manage.py 2021-07-09 05:08:56.000000000 +0200 @@ -1767,6 +1767,7 @@ def setUp(self): super(TestDBArchiveDeletedRows, self).setUp() + self.enforce_fk_constraints() self.cli = manage.DbCommands() self.output = StringIO() self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output)) @@ -1812,6 +1813,7 @@ def setUp(self): super(TestDBArchiveDeletedRowsMultiCell, self).setUp() + self.enforce_fk_constraints() self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) diff -Nru nova-22.0.1/nova/tests/functional/test_servers.py nova-22.2.2/nova/tests/functional/test_servers.py --- nova-22.0.1/nova/tests/functional/test_servers.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/test_servers.py 2021-07-09 05:08:56.000000000 +0200 @@ -40,6 +40,8 @@ from nova.network import neutron as neutronapi from nova import objects from nova.objects import block_device as block_device_obj +from nova.policies import base as base_policies +from nova.policies import servers as servers_policies from nova.scheduler import utils from nova import test from nova.tests import fixtures as nova_fixtures @@ -8256,3 +8258,115 @@ 'OS-DCF:diskConfig': 'AUTO'}}) self.assertEqual(403, ex.response.status_code) self._check_allocations_usage(self.server) + + +class CrossCellResizeWithQoSPort(PortResourceRequestBasedSchedulingTestBase): + NUMBER_OF_CELLS = 2 + + def setUp(self): + # Use our custom weigher defined above to make sure that we have + # a predictable host order in the alternate list returned by the + # scheduler for migration. + self.useFixture(nova_fixtures.HostNameWeigherFixture()) + super(CrossCellResizeWithQoSPort, self).setUp() + # start compute2 in cell2, compute1 is started in cell1 by default + self.compute2 = self._start_compute('host2', cell_name='cell2') + self.compute2_rp_uuid = self._get_provider_uuid_by_host('host2') + self._create_networking_rp_tree('host2', self.compute2_rp_uuid) + self.compute2_service_id = self.admin_api.get_services( + host='host2', binary='nova-compute')[0]['id'] + + # Enable cross-cell resize policy since it defaults to not allow + # anyone to perform that type of operation. For these tests we'll + # just allow admins to perform cross-cell resize. + self.policy.set_rules({ + servers_policies.CROSS_CELL_RESIZE: + base_policies.RULE_ADMIN_API}, + overwrite=False) + + def test_cross_cell_migrate_server_with_qos_ports(self): + """Test that cross cell migration is not supported with qos ports and + nova therefore falls back to do a same cell migration instead. + To test this properly we first make sure that there is no valid host + in the same cell but there is valid host in another cell and observe + that the migration fails with NoValidHost. Then we start a new compute + in the same cell the instance is in and retry the migration that is now + expected to pass. + """ + + non_qos_normal_port = self.neutron.port_1 + qos_normal_port = self.neutron.port_with_resource_request + qos_sriov_port = self.neutron.port_with_sriov_resource_request + + server = self._create_server_with_ports_and_check_allocation( + non_qos_normal_port, qos_normal_port, qos_sriov_port) + + orig_create_binding = neutronapi.API._create_port_binding + + hosts = { + 'host1': self.compute1_rp_uuid, 'host2': self.compute2_rp_uuid} + + # Add an extra check to our neutron fixture. This check makes sure that + # the RP sent in the binding corresponds to host of the binding. In a + # real deployment this is checked by the Neutron server. As bug + # 1907522 showed we fail this check for cross cell migration with qos + # ports in a real deployment. So to reproduce that bug we need to have + # the same check in our test env too. + def spy_on_create_binding(context, client, port_id, data): + host_rp_uuid = hosts[data['binding']['host']] + device_rp_uuid = data['binding']['profile'].get('allocation') + if port_id == qos_normal_port['id']: + if device_rp_uuid != self.ovs_bridge_rp_per_host[host_rp_uuid]: + raise exception.PortBindingFailed(port_id=port_id) + elif port_id == qos_sriov_port['id']: + if (device_rp_uuid not in + self.sriov_dev_rp_per_host[host_rp_uuid].values()): + raise exception.PortBindingFailed(port_id=port_id) + + return orig_create_binding(context, client, port_id, data) + + with mock.patch( + 'nova.network.neutron.API._create_port_binding', + side_effect=spy_on_create_binding, autospec=True + ): + # We expect the migration to fail as the only available target + # host is in a different cell and while cross cell migration is + # enabled it is not supported for neutron ports with resource + # request. + self.api.post_server_action(server['id'], {'migrate': None}) + self._wait_for_migration_status(server, ['error']) + self._wait_for_server_parameter( + server, + {'status': 'ACTIVE', 'OS-EXT-SRV-ATTR:host': 'host1'}) + event = self._wait_for_action_fail_completion( + server, 'migrate', 'conductor_migrate_server') + self.assertIn( + 'exception.NoValidHost', event['traceback']) + log = self.stdlog.logger.output + self.assertIn( + 'Request is allowed by policy to perform cross-cell resize ' + 'but the instance has ports with resource request and ' + 'cross-cell resize is not supported with such ports.', + log) + self.assertNotIn( + 'nova.exception.PortBindingFailed: Binding failed for port', + log) + self.assertNotIn( + "AttributeError: 'NoneType' object has no attribute 'version'", + log) + + # Now start a new compute in the same cell as the instance and retry + # the migration. + self._start_compute('host3', cell_name='cell1') + self.compute3_rp_uuid = self._get_provider_uuid_by_host('host3') + self._create_networking_rp_tree('host3', self.compute3_rp_uuid) + + with mock.patch( + 'nova.network.neutron.API._create_port_binding', + side_effect=spy_on_create_binding, autospec=True + ): + server = self._migrate_server(server) + self.assertEqual('host3', server['OS-EXT-SRV-ATTR:host']) + + self._delete_server_and_check_allocations( + server, qos_normal_port, qos_sriov_port) diff -Nru nova-22.0.1/nova/tests/functional/test_service.py nova-22.2.2/nova/tests/functional/test_service.py --- nova-22.0.1/nova/tests/functional/test_service.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/functional/test_service.py 2021-07-09 05:08:56.000000000 +0200 @@ -10,7 +10,10 @@ # License for the specific language governing permissions and limitations # under the License. +from unittest import mock + from nova import context as nova_context +from nova.objects import service from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional import fixtures as func_fixtures @@ -97,3 +100,40 @@ self.metadata.start() # Cell cache should be empty after the service reset. self.assertEqual({}, nova_context.CELL_CACHE) + + +class TestOldComputeCheck( + test.TestCase, integrated_helpers.InstanceHelperMixin): + + def test_conductor_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self.start_service('conductor') + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) + + def test_api_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self.useFixture(nova_fixtures.OSAPIFixture(api_version='v2.1')) + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) + + def test_compute_warns_if_old_compute(self): + old_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=old_version): + self._start_compute('host1') + self.assertIn( + 'Current Nova version does not support computes older than', + self.stdlog.logger.output) diff -Nru nova-22.0.1/nova/tests/unit/api/openstack/compute/test_server_reset_state.py nova-22.2.2/nova/tests/unit/api/openstack/compute/test_server_reset_state.py --- nova-22.0.1/nova/tests/unit/api/openstack/compute/test_server_reset_state.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/api/openstack/compute/test_server_reset_state.py 2021-07-09 05:08:56.000000000 +0200 @@ -18,6 +18,7 @@ from nova.api.openstack.compute import admin_actions \ as admin_actions_v21 +from nova.compute import instance_actions from nova.compute import vm_states from nova import exception from nova import objects @@ -59,18 +60,27 @@ def _get_request(self): return fakes.HTTPRequest.blank('') + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_no_state(self): self.assertRaises(self.bad_request, self.admin_api._reset_state, self.request, self.uuid, body={"os-resetState": None}) + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_bad_state(self): self.assertRaises(self.bad_request, self.admin_api._reset_state, self.request, self.uuid, body={"os-resetState": {"state": "spam"}}) + @mock.patch( + 'nova.objects.instance_action.InstanceAction.action_start', + new=mock.Mock(spec=objects.InstanceAction)) def test_no_instance(self): self.compute_api.get = mock.MagicMock( side_effect=exception.InstanceNotFound(instance_id='inst_ud')) @@ -84,11 +94,14 @@ self.context, self.uuid, expected_attrs=None, cell_down_support=False) - def test_reset_active(self): + @mock.patch('nova.objects.instance_action.InstanceAction.action_start') + def test_reset_active(self, mock_instance_action_start): expected = dict(vm_state=vm_states.ACTIVE, task_state=None) self.instance.save = mock.MagicMock( side_effect=lambda **kw: self._check_instance_state(expected)) self.compute_api.get = mock.MagicMock(return_value=self.instance) + mock_instance_action = mock.Mock(spec=objects.InstanceAction) + mock_instance_action_start.return_value = mock_instance_action body = {"os-resetState": {"state": "active"}} result = self.admin_api._reset_state(self.request, self.uuid, @@ -107,11 +120,18 @@ cell_down_support=False) self.instance.save.assert_called_once_with(admin_state_reset=True) - def test_reset_error(self): + mock_instance_action_start.assert_called_once_with( + self.context, self.instance.uuid, instance_actions.RESET_STATE) + mock_instance_action.finish.assert_called_once() + + @mock.patch('nova.objects.instance_action.InstanceAction.action_start') + def test_reset_error(self, mock_instance_action_start): expected = dict(vm_state=vm_states.ERROR, task_state=None) self.instance.save = mock.MagicMock( side_effect=lambda **kw: self._check_instance_state(expected)) self.compute_api.get = mock.MagicMock(return_value=self.instance) + mock_instance_action = mock.Mock(spec=objects.InstanceAction) + mock_instance_action_start.return_value = mock_instance_action body = {"os-resetState": {"state": "error"}} result = self.admin_api._reset_state(self.request, self.uuid, @@ -129,3 +149,7 @@ self.context, self.instance.uuid, expected_attrs=None, cell_down_support=False) self.instance.save.assert_called_once_with(admin_state_reset=True) + + mock_instance_action_start.assert_called_once_with( + self.context, self.instance.uuid, instance_actions.RESET_STATE) + mock_instance_action.finish.assert_called_once() diff -Nru nova-22.0.1/nova/tests/unit/api/openstack/test_requestlog.py nova-22.2.2/nova/tests/unit/api/openstack/test_requestlog.py --- nova-22.0.1/nova/tests/unit/api/openstack/test_requestlog.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/api/openstack/test_requestlog.py 2021-07-09 05:08:56.000000000 +0200 @@ -42,7 +42,8 @@ # this is the minimal set of magic mocks needed to convince # the API service it can start on it's own without a database. mocks = ['nova.objects.Service.get_by_host_and_binary', - 'nova.objects.Service.create'] + 'nova.objects.Service.create', + 'nova.utils.raise_if_old_compute'] self.stdlog = fixtures.StandardLogging() self.useFixture(self.stdlog) for m in mocks: diff -Nru nova-22.0.1/nova/tests/unit/api/openstack/test_wsgi_app.py nova-22.2.2/nova/tests/unit/api/openstack/test_wsgi_app.py --- nova-22.0.1/nova/tests/unit/api/openstack/test_wsgi_app.py 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/api/openstack/test_wsgi_app.py 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,85 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tempfile + +import fixtures +import mock +from oslo_config import fixture as config_fixture +from oslotest import base + +from nova.api.openstack import wsgi_app +from nova import test +from nova.tests import fixtures as nova_fixtures + + +class WSGIAppTest(base.BaseTestCase): + + _paste_config = """ +[app:nova-api] +use = egg:Paste#static +document_root = /tmp + """ + + def setUp(self): + # Ensure BaseTestCase's ConfigureLogging fixture is disabled since + # we're using our own (StandardLogging). + with fixtures.EnvironmentVariable('OS_LOG_CAPTURE', '0'): + super(WSGIAppTest, self).setUp() + self.stdlog = self.useFixture(nova_fixtures.StandardLogging()) + self.conf = tempfile.NamedTemporaryFile(mode='w+t') + self.conf.write(self._paste_config.lstrip()) + self.conf.seek(0) + self.conf.flush() + self.addCleanup(self.conf.close) + # Use of this fixture takes care of cleaning up config settings for + # subsequent tests. + self.useFixture(config_fixture.Config()) + + @mock.patch('sys.argv', return_value=mock.sentinel.argv) + @mock.patch('nova.db.sqlalchemy.api.configure') + @mock.patch('nova.api.openstack.wsgi_app._setup_service') + @mock.patch('nova.api.openstack.wsgi_app._get_config_files') + def test_init_application_called_twice(self, mock_get_files, mock_setup, + mock_db_configure, mock_argv): + """Test that init_application can tolerate being called twice in a + single python interpreter instance. + + When nova-api is run via mod_wsgi, if any exception is raised during + init_application, mod_wsgi will re-run the WSGI script without + restarting the daemon process even when configured for Daemon Mode. + + We access the database as part of init_application, so if nova-api + starts up before the database is up, we'll get, for example, a + DBConnectionError raised during init_application and our WSGI script + will get reloaded/re-run by mod_wsgi. + """ + mock_get_files.return_value = [self.conf.name] + mock_setup.side_effect = [test.TestingException, None] + # We need to mock the global database configure() method, else we will + # be affected by global database state altered by other tests that ran + # before this test, causing this test to fail with + # oslo_db.sqlalchemy.enginefacade.AlreadyStartedError. We can instead + # mock the method to raise an exception if it's called a second time in + # this test to simulate the fact that the database does not tolerate + # re-init [after a database query has been made]. + mock_db_configure.side_effect = [None, test.TestingException] + # Run init_application the first time, simulating an exception being + # raised during it. + self.assertRaises(test.TestingException, wsgi_app.init_application, + 'nova-api') + # Now run init_application a second time, it should succeed since no + # exception is being raised (the init of global data should not be + # re-attempted). + wsgi_app.init_application('nova-api') + self.assertIn('Global data already initialized, not re-initializing.', + self.stdlog.logger.output) diff -Nru nova-22.0.1/nova/tests/unit/cmd/test_status.py nova-22.2.2/nova/tests/unit/cmd/test_status.py --- nova-22.0.1/nova/tests/unit/cmd/test_status.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/cmd/test_status.py 2021-07-09 05:08:56.000000000 +0200 @@ -44,6 +44,7 @@ # NOTE(mriedem): We only use objects as a convenience to populate the database # in the tests, we don't use them in the actual CLI. from nova import objects +from nova.objects import service from nova import policy from nova import test from nova.tests import fixtures as nova_fixtures @@ -667,3 +668,32 @@ jsonutils.dump(self.data, fh) self.assertEqual(upgradecheck.Code.FAILURE, self.cmd._check_policy_json().code) + + +class TestUpgradeCheckOldCompute(test.NoDBTestCase): + + def setUp(self): + super(TestUpgradeCheckOldCompute, self).setUp() + self.cmd = status.UpgradeCommands() + + def test_no_compute(self): + self.assertEqual( + upgradecheck.Code.SUCCESS, self.cmd._check_old_computes().code) + + def test_only_new_compute(self): + last_supported_version = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=last_supported_version): + self.assertEqual( + upgradecheck.Code.SUCCESS, self.cmd._check_old_computes().code) + + def test_old_compute(self): + too_old = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] - 1 + with mock.patch( + "nova.objects.service.get_minimum_version_all_cells", + return_value=too_old): + result = self.cmd._check_old_computes() + self.assertEqual(upgradecheck.Code.WARNING, result.code) diff -Nru nova-22.0.1/nova/tests/unit/compute/test_api.py nova-22.2.2/nova/tests/unit/compute/test_api.py --- nova-22.0.1/nova/tests/unit/compute/test_api.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/compute/test_api.py 2021-07-09 05:08:56.000000000 +0200 @@ -4211,7 +4211,7 @@ 'cores': 1 + instance.flavor.vcpus, 'ram': 512 + instance.flavor.memory_mb}, project_id=instance.project_id) - update_qfd.assert_called_once_with(admin_context, instance, False) + update_qfd.assert_called_once_with(admin_context, instance.uuid, False) @mock.patch('nova.objects.Quotas.get_all_by_project_and_user', new=mock.MagicMock()) @@ -4252,7 +4252,7 @@ 'cores': 1 + instance.flavor.vcpus, 'ram': 512 + instance.flavor.memory_mb}, project_id=instance.project_id) - update_qfd.assert_called_once_with(self.context, instance, False) + update_qfd.assert_called_once_with(self.context, instance.uuid, False) @mock.patch.object(objects.InstanceAction, 'action_start') def test_external_instance_event(self, mock_action_start): @@ -5971,7 +5971,8 @@ inst = objects.Instance(uuid=uuid) im = objects.InstanceMapping(instance_uuid=uuid) mock_get.return_value = im - self.compute_api._update_queued_for_deletion(self.context, inst, True) + self.compute_api._update_queued_for_deletion( + self.context, inst.uuid, True) self.assertTrue(im.queued_for_delete) mock_get.assert_called_once_with(self.context, inst.uuid) mock_save.assert_called_once_with() @@ -7522,7 +7523,8 @@ version is not new enough. """ instance = objects.Instance( - project_id='fake-project', user_id='fake-user') + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) with mock.patch.object(self.context, 'can', return_value=True) as can: self.assertFalse(self.compute_api._allow_cross_cell_resize( self.context, instance)) @@ -7530,20 +7532,45 @@ mock_get_min_ver.assert_called_once_with( self.context, ['nova-compute']) + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance', + return_value=[objects.RequestGroup()]) @mock.patch('nova.objects.service.get_minimum_version_all_cells', return_value=compute_api.MIN_COMPUTE_CROSS_CELL_RESIZE) - def test_allow_cross_cell_resize_true(self, mock_get_min_ver): + def test_allow_cross_cell_resize_false_port_with_resource_req( + self, mock_get_min_ver, mock_get_res_req): + """Policy allows cross-cell resize but minimum nova-compute service + version is not new enough. + """ + instance = objects.Instance( + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) + with mock.patch.object(self.context, 'can', return_value=True) as can: + self.assertFalse(self.compute_api._allow_cross_cell_resize( + self.context, instance)) + can.assert_called_once() + mock_get_min_ver.assert_called_once_with( + self.context, ['nova-compute']) + mock_get_res_req.assert_called_once_with(self.context, uuids.instance) + + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance', + return_value=[]) + @mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=compute_api.MIN_COMPUTE_CROSS_CELL_RESIZE) + def test_allow_cross_cell_resize_true( + self, mock_get_min_ver, mock_get_res_req): """Policy allows cross-cell resize and minimum nova-compute service version is new enough. """ instance = objects.Instance( - project_id='fake-project', user_id='fake-user') + project_id='fake-project', user_id='fake-user', + uuid=uuids.instance) with mock.patch.object(self.context, 'can', return_value=True) as can: self.assertTrue(self.compute_api._allow_cross_cell_resize( self.context, instance)) can.assert_called_once() mock_get_min_ver.assert_called_once_with( self.context, ['nova-compute']) + mock_get_res_req.assert_called_once_with(self.context, uuids.instance) def _test_block_accelerators(self, instance, args_info, until_service=None): diff -Nru nova-22.0.1/nova/tests/unit/compute/test_compute_mgr.py nova-22.2.2/nova/tests/unit/compute/test_compute_mgr.py --- nova-22.0.1/nova/tests/unit/compute/test_compute_mgr.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/compute/test_compute_mgr.py 2021-07-09 05:08:56.000000000 +0200 @@ -1111,6 +1111,11 @@ "time this service is starting on this host, then you can ignore " "this warning.", 'fake-node1') + def test_init_host_disk_devices_configuration_failure(self): + self.flags(max_disk_devices_to_attach=0, group='compute') + self.assertRaises(exception.InvalidConfiguration, + self.compute.init_host) + @mock.patch.object(objects.InstanceList, 'get_by_host', new=mock.Mock()) @mock.patch('nova.compute.manager.ComputeManager.' @@ -3382,12 +3387,16 @@ CONF.host, instance.uuid, graceful_exit=False) return result + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_success(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_fail(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3397,7 +3406,9 @@ self._test_check_can_live_migrate_destination, do_raise=True) - def test_check_can_live_migrate_destination_contins_vifs(self): + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) + def test_check_can_live_migrate_destination_contains_vifs(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) @@ -3405,6 +3416,8 @@ self.assertIn('vifs', migrate_data) self.assertIsNotNone(migrate_data.vifs) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_no_binding_extended(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', @@ -3412,18 +3425,40 @@ migrate_data = self._test_check_can_live_migrate_destination() self.assertNotIn('vifs', migrate_data) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_false(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=False) + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_check_can_live_migrate_destination_src_numa_lm_true(self): self.useFixture(std_fixtures.MonkeyPatch( 'nova.network.neutron.API.supports_port_binding_extension', lambda *args: True)) self._test_check_can_live_migrate_destination(src_numa_lm=True) + @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + def test_check_can_live_migrate_destination_fail_group_policy( + self, mock_fail_db): + + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.ACTIVE, + node='fake-node') + + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + exception.MigrationPreCheckError, + self.compute.check_can_live_migrate_destination, + self.context, instance, None, None, None, None) + def test_dest_can_numa_live_migrate(self): positive_dest_check_data = objects.LibvirtLiveMigrateData( dst_supports_numa_live_migration=True) @@ -5340,19 +5375,21 @@ node = uuidutils.generate_uuid() # ironic node uuid instance = fake_instance.fake_instance_obj(self.context, node=node) instance.migration_context = None + migration = objects.Migration(status='accepted') with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance_with_claim'), mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + mock.patch.object(migration, 'save'), + ) as (mock_get, mock_rebuild, mock_save, mock_migration_save): self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, None, False, - False, False, None, None, {}, None, []) + False, False, migration, None, {}, None, []) self.assertFalse(mock_get.called) self.assertEqual(node, instance.node) - mock_set.assert_called_once_with(None, 'done') + self.assertEqual('done', migration.status) + mock_migration_save.assert_called_once_with() def test_rebuild_node_updated_if_recreate(self): dead_node = uuidutils.generate_uuid() @@ -5365,16 +5402,15 @@ with test.nested( mock.patch.object(self.compute, '_get_compute_info'), mock.patch.object(self.compute, '_do_rebuild_instance'), - mock.patch.object(objects.Instance, 'save'), - mock.patch.object(self.compute, '_set_migration_status'), - ) as (mock_get, mock_rebuild, mock_save, mock_set): + ) as (mock_get, mock_rebuild): mock_get.return_value.hypervisor_hostname = 'new-node' self.compute.rebuild_instance( self.context, instance, None, None, None, None, None, - None, True, False, False, None, None, {}, None, []) + None, True, False, False, mock.sentinel.migration, None, {}, + None, []) mock_get.assert_called_once_with(mock.ANY, self.compute.host) - self.assertEqual('new-node', instance.node) - mock_set.assert_called_once_with(None, 'done') + mock_rt.finish_evacuation.assert_called_once_with( + instance, 'new-node', mock.sentinel.migration) # Make sure the rebuild_claim was called with the proper image_meta # from the instance. mock_rt.rebuild_claim.assert_called_once() @@ -6875,14 +6911,13 @@ @mock.patch.object(objects.Instance, 'save') @mock.patch.object(manager.ComputeManager, '_nil_out_instance_obj_host_and_node') - @mock.patch.object(manager.ComputeManager, '_cleanup_volumes') @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks') @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state') @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance') def test_rescheduled_exception_without_retry(self, - mock_build_run, mock_add, mock_set, mock_clean_net, mock_clean_vol, - mock_nil, mock_save, mock_start, mock_finish): + mock_build_run, mock_add, mock_set, mock_clean_net, mock_nil, + mock_save, mock_start, mock_finish): self._do_build_instance_update(mock_save) mock_build_run.side_effect = exception.RescheduledException(reason='', instance_uuid=self.instance.uuid) @@ -6918,9 +6953,6 @@ self.accel_uuids) mock_clean_net.assert_called_once_with(self.context, self.instance, self.requested_networks) - mock_clean_vol.assert_called_once_with(self.context, - self.instance, self.block_device_mapping, - raise_exc=False) mock_add.assert_called_once_with(self.context, self.instance, mock.ANY, mock.ANY, fault_message=mock.ANY) mock_nil.assert_called_once_with(self.instance) @@ -7530,7 +7562,8 @@ def test_validate_policy_honors_workaround_disabled(self, mock_get): instance = objects.Instance(uuid=uuids.instance) hints = {'group': 'foo'} - mock_get.return_value = objects.InstanceGroup(policy=None) + mock_get.return_value = objects.InstanceGroup(policy=None, + uuid=uuids.group) self.compute._validate_instance_group_policy(self.context, instance, hints) mock_get.assert_called_once_with(self.context, 'foo') @@ -7556,10 +7589,14 @@ instance, hints) mock_get.assert_called_once_with(self.context, uuids.group_hint) + @mock.patch('nova.objects.InstanceGroup.get_by_uuid') @mock.patch('nova.objects.InstanceList.get_uuids_by_host') @mock.patch('nova.objects.InstanceGroup.get_by_hint') - def test_validate_instance_group_policy_with_rules(self, mock_get_by_hint, - mock_get_by_host): + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + @mock.patch('nova.objects.MigrationList.get_in_progress_by_host_and_node') + def test_validate_instance_group_policy_with_rules( + self, migration_list, nodes, mock_get_by_hint, mock_get_by_host, + mock_get_by_uuid): # Create 2 instance in same host, inst2 created before inst1 instance = objects.Instance(uuid=uuids.inst1) hints = {'group': [uuids.group_hint]} @@ -7568,17 +7605,26 @@ mock_get_by_host.return_value = existing_insts # if group policy rules limit to 1, raise RescheduledException - mock_get_by_hint.return_value = objects.InstanceGroup( + group = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': '1'}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group + mock_get_by_uuid.return_value = group + nodes.return_value = ['nodename'] + migration_list.return_value = [objects.Migration( + uuid=uuids.migration, instance_uuid=uuids.instance)] self.assertRaises(exception.RescheduledException, self.compute._validate_instance_group_policy, self.context, instance, hints) # if group policy rules limit change to 2, validate OK - mock_get_by_hint.return_value = objects.InstanceGroup( + group2 = objects.InstanceGroup( policy='anti-affinity', rules={'max_server_per_host': 2}, - hosts=['host1'], members=members_uuids) + hosts=['host1'], members=members_uuids, + uuid=uuids.group) + mock_get_by_hint.return_value = group2 + mock_get_by_uuid.return_value = group2 self.compute._validate_instance_group_policy(self.context, instance, hints) @@ -9105,6 +9151,8 @@ manager.ComputeManager() mock_executor.assert_called_once_with() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_cinder_v3_api(self): # This tests that pre_live_migration with a bdm with an # attachment_id, will create a new attachment and update @@ -9182,6 +9230,8 @@ _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exception_cinder_v3_api(self): # The instance in this test has 2 attachments. The second attach_create # will throw an exception. This will test that the first attachment @@ -9251,6 +9301,8 @@ self.assertGreater(len(m.mock_calls), 0) _test() + @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock( + side_effect=exception.InstanceGroupNotFound(group_uuid=''))) def test_pre_live_migration_exceptions_delete_attachments(self): # The instance in this test has 2 attachments. The call to # driver.pre_live_migration will raise an exception. This will test @@ -9319,12 +9371,18 @@ """Tests the various ways that _get_neutron_events_for_live_migration will return an empty list. """ + migration = mock.Mock() + migration.is_same_host = lambda: False + self.assertFalse(migration.is_same_host()) + # 1. no timeout self.flags(vif_plugging_timeout=0) with mock.patch.object(self.instance, 'get_network_info') as nw_info: nw_info.return_value = network_model.NetworkInfo( - [network_model.VIF(uuids.port1)]) + [network_model.VIF(uuids.port1, details={ + network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True})]) + self.assertTrue(nw_info.return_value[0].is_hybrid_plug_enabled()) self.assertEqual( [], self.compute._get_neutron_events_for_live_migration( self.instance)) @@ -9333,7 +9391,18 @@ self.flags(vif_plugging_timeout=300) with mock.patch.object(self.instance, 'get_network_info') as nw_info: - nw_info.return_value = [] + nw_info.return_value = network_model.NetworkInfo([]) + self.assertEqual( + [], self.compute._get_neutron_events_for_live_migration( + self.instance)) + + # 3. no plug time events + with mock.patch.object(self.instance, 'get_network_info') as nw_info: + nw_info.return_value = network_model.NetworkInfo( + [network_model.VIF( + uuids.port1, details={ + network_model.VIF_DETAILS_OVS_HYBRID_PLUG: False})]) + self.assertFalse(nw_info.return_value[0].is_hybrid_plug_enabled()) self.assertEqual( [], self.compute._get_neutron_events_for_live_migration( self.instance)) @@ -9351,9 +9420,11 @@ wait_for_vif_plugged=True) mock_get_bdms.return_value = objects.BlockDeviceMappingList(objects=[]) mock_pre_live_mig.return_value = migrate_data + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + network_model.VIF(uuids.port1, details=details), + network_model.VIF(uuids.port2, details=details) ])) self.compute._waiting_live_migrations[self.instance.uuid] = ( self.migration, mock.MagicMock() @@ -9383,11 +9454,12 @@ of not waiting. """ migrate_data = objects.LibvirtLiveMigrateData() + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} mock_get_bdms.return_value = objects.BlockDeviceMappingList(objects=[]) mock_pre_live_mig.return_value = migrate_data self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1)])) + network_model.VIF(uuids.port1, details=details)])) self.compute._waiting_live_migrations[self.instance.uuid] = ( self.migration, mock.MagicMock() ) @@ -9531,9 +9603,11 @@ mock_get_bdms.return_value = source_bdms migrate_data = objects.LibvirtLiveMigrateData( wait_for_vif_plugged=True) + details = {network_model.VIF_DETAILS_OVS_HYBRID_PLUG: True} self.instance.info_cache = objects.InstanceInfoCache( network_info=network_model.NetworkInfo([ - network_model.VIF(uuids.port1), network_model.VIF(uuids.port2) + network_model.VIF(uuids.port1, details=details), + network_model.VIF(uuids.port2, details=details) ])) self.compute._waiting_live_migrations = {} fake_migration = objects.Migration( @@ -10603,6 +10677,54 @@ self.context, instance.image_meta, instance, instance.flavor, request_spec, filter_properties={}, node=instance.node, clean_shutdown=True, migration=migration, host_list=[]) + # The instance.vm_state should remain unchanged + # (_error_out_instance_on_exception will set to ACTIVE by default). + self.assertEqual(vm_states.STOPPED, instance.vm_state) + + @mock.patch('nova.compute.utils.notify_usage_exists') + @mock.patch('nova.compute.manager.ComputeManager.' + '_notify_about_instance_usage') + @mock.patch('nova.compute.utils.notify_about_resize_prep_instance') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation') + @mock.patch('nova.compute.manager.ComputeManager.' + '_reschedule_resize_or_reraise') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + # this is almost copy-paste from test_prep_resize_fails_rollback + def test_prep_resize_fails_group_validation( + self, add_instance_fault_from_exc, _reschedule_resize_or_reraise, + _revert_allocation, mock_instance_save, + notify_about_resize_prep_instance, _notify_about_instance_usage, + notify_usage_exists): + """Tests that if _validate_instance_group_policy raises + InstanceFaultRollback, the instance.vm_state is reset properly in + _error_out_instance_on_exception + """ + instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, vm_state=vm_states.STOPPED, + node='fake-node', expected_attrs=['system_metadata', 'flavor']) + migration = mock.MagicMock(spec='nova.objects.Migration') + request_spec = mock.MagicMock(spec='nova.objects.RequestSpec') + ex = exception.RescheduledException( + instance_uuid=instance.uuid, reason="policy violated") + ex2 = exception.InstanceFaultRollback( + inner_exception=ex) + + def fake_reschedule_resize_or_reraise(*args, **kwargs): + raise ex2 + + _reschedule_resize_or_reraise.side_effect = ( + fake_reschedule_resize_or_reraise) + + with mock.patch.object(self.compute, '_validate_instance_group_policy', + side_effect=ex): + self.assertRaises( + # _error_out_instance_on_exception should reraise the + # RescheduledException inside InstanceFaultRollback. + exception.RescheduledException, self.compute.prep_resize, + self.context, instance.image_meta, instance, instance.flavor, + request_spec, filter_properties={}, node=instance.node, + clean_shutdown=True, migration=migration, host_list=[]) # The instance.vm_state should remain unchanged # (_error_out_instance_on_exception will set to ACTIVE by default). self.assertEqual(vm_states.STOPPED, instance.vm_state) diff -Nru nova-22.0.1/nova/tests/unit/db/test_db_api.py nova-22.2.2/nova/tests/unit/db/test_db_api.py --- nova-22.0.1/nova/tests/unit/db/test_db_api.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/db/test_db_api.py 2021-07-09 05:08:56.000000000 +0200 @@ -39,7 +39,6 @@ import six from six.moves import range from sqlalchemy import Column -from sqlalchemy.dialects import sqlite from sqlalchemy.exc import OperationalError from sqlalchemy.exc import SQLAlchemyError from sqlalchemy import inspect @@ -1693,6 +1692,14 @@ sys_meta = utils.metadata_to_dict(inst['system_metadata']) self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_get_with_meta(self): + inst_id = self.create_instance_with_args().id + inst = db.instance_get(self.ctxt, inst_id) + meta = utils.metadata_to_dict(inst['metadata']) + self.assertEqual(meta, self.sample_data['metadata']) + sys_meta = utils.metadata_to_dict(inst['system_metadata']) + self.assertEqual(sys_meta, self.sample_data['system_metadata']) + def test_instance_update(self): instance = self.create_instance_with_args() metadata = {'host': 'bar', 'key2': 'wuff'} @@ -6288,17 +6295,24 @@ instance_actions_events=1) self._assertEqualObjects(expected, results[0]) - # Archive 1 row deleted before 2017-01-03. instance_action_events - # should be the table with row deleted due to FK contraints + # Archive 1 row deleted before 2017-01-03 + # Because the instances table will be processed first, tables that + # refer to it (instance_actions and instance_action_events) will be + # visited and archived in the same transaction as the instance, to + # avoid orphaning the instance record (archive dependent records in one + # transaction) before_date = dateutil_parser.parse('2017-01-03', fuzzy=True) results = db.archive_deleted_rows(max_rows=1, before=before_date) - expected = dict(instance_actions_events=1) + expected = dict(instances=1, + instance_actions=1, + instance_actions_events=1) self._assertEqualObjects(expected, results[0]) - # Archive all other rows deleted before 2017-01-03. This should - # delete row in instance_actions, then row in instances due to FK - # constraints + # Try to archive all other rows deleted before 2017-01-03. This should + # not archive anything because the instances table and tables that + # refer to it (instance_actions and instance_action_events) were all + # archived in the last run. results = db.archive_deleted_rows(max_rows=100, before=before_date) - expected = dict(instances=1, instance_actions=1) + expected = {} self._assertEqualObjects(expected, results[0]) # Verify we have 4 left in main @@ -6432,18 +6446,7 @@ def _check_sqlite_version_less_than_3_7(self): # SQLite doesn't enforce foreign key constraints without a pragma. - dialect = self.engine.url.get_dialect() - if dialect == sqlite.dialect: - # We're seeing issues with foreign key support in SQLite 3.6.20 - # SQLAlchemy doesn't support it at all with < SQLite 3.6.19 - # It works fine in SQLite 3.7. - # So return early to skip this test if running SQLite < 3.7 - import sqlite3 - tup = sqlite3.sqlite_version_info - if tup[0] < 3 or (tup[0] == 3 and tup[1] < 7): - self.skipTest( - 'sqlite version too old for reliable SQLA foreign_keys') - self.conn.execute("PRAGMA foreign_keys = ON") + self.enforce_fk_constraints(engine=self.engine) def test_archive_deleted_rows_for_migrations(self): # migrations.instance_uuid depends on instances.uuid @@ -6457,19 +6460,8 @@ ins_stmt = self.migrations.insert().values(instance_uuid=instance_uuid, deleted=0) self.conn.execute(ins_stmt) - # The first try to archive instances should fail, due to FK. - num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata, - "instances", - max_rows=None, - before=None) - self.assertEqual(0, num[0]) - # Then archiving migrations should work. - num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata, - "migrations", - max_rows=None, - before=None) - self.assertEqual(1, num[0]) - # Then archiving instances should work. + # Archiving instances should result in migrations related to the + # instances also being archived. num = sqlalchemy_api._archive_deleted_rows_for_table(self.metadata, "instances", max_rows=None, diff -Nru nova-22.0.1/nova/tests/unit/network/test_neutron.py nova-22.2.2/nova/tests/unit/network/test_neutron.py --- nova-22.0.1/nova/tests/unit/network/test_neutron.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/network/test_neutron.py 2021-07-09 05:08:56.000000000 +0200 @@ -253,6 +253,8 @@ auth_token='token') cl = neutronapi.get_client(my_context) self.assertEqual(retries, cl.httpclient.connect_retries) + kcl = neutronapi._get_ksa_client(my_context) + self.assertEqual(retries, kcl.connect_retries) class TestAPIBase(test.TestCase): diff -Nru nova-22.0.1/nova/tests/unit/pci/test_manager.py nova-22.2.2/nova/tests/unit/pci/test_manager.py --- nova-22.0.1/nova/tests/unit/pci/test_manager.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/pci/test_manager.py 2021-07-09 05:08:56.000000000 +0200 @@ -236,6 +236,40 @@ tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json) self.assertEqual(2, len(tracker.pci_devs)) + @mock.patch("nova.pci.manager.LOG.debug") + def test_update_devices_from_hypervisor_resources_32bit_domain( + self, mock_debug): + self.flags( + group='pci', + passthrough_whitelist=[ + '{"product_id":"2032", "vendor_id":"8086"}']) + # There are systems where 32 bit PCI domain is used. See bug 1897528 + # for example. While nova (and qemu) does not support assigning such + # devices but the existence of such device in the system should not + # lead to an error. + fake_pci = { + 'compute_node_id': 1, + 'address': '10000:00:02.0', + 'product_id': '2032', + 'vendor_id': '8086', + 'request_id': None, + 'status': fields.PciDeviceStatus.AVAILABLE, + 'dev_type': fields.PciDeviceType.STANDARD, + 'parent_addr': None, + 'numa_node': 0} + + fake_pci_devs = [fake_pci] + fake_pci_devs_json = jsonutils.dumps(fake_pci_devs) + tracker = manager.PciDevTracker(self.fake_context) + # We expect that the device with 32bit PCI domain is ignored + tracker.update_devices_from_hypervisor_resources(fake_pci_devs_json) + self.assertEqual(0, len(tracker.pci_devs)) + mock_debug.assert_called_once_with( + 'Skipping PCI device %s reported by the hypervisor: %s', + {'address': '10000:00:02.0', 'parent_addr': None}, + 'The property domain (10000) is greater than the maximum ' + 'allowable value (FFFF).') + def test_set_hvdev_new_dev(self): fake_pci_3 = dict(fake_pci, address='0000:00:00.4', vendor_id='v2') fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_1), diff -Nru nova-22.0.1/nova/tests/unit/storage/test_rbd.py nova-22.2.2/nova/tests/unit/storage/test_rbd.py --- nova-22.0.1/nova/tests/unit/storage/test_rbd.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/storage/test_rbd.py 2021-07-09 05:08:56.000000000 +0200 @@ -13,6 +13,7 @@ from eventlet import tpool import mock +from oslo_concurrency import processutils from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids @@ -110,7 +111,7 @@ self.rbd_pool = 'rbd' self.rbd_connect_timeout = 5 self.flags( - images_rbd_pool=self.images_rbd_pool, + images_rbd_pool=self.rbd_pool, images_rbd_ceph_conf='/foo/bar.conf', rbd_connect_timeout=self.rbd_connect_timeout, rbd_user='foo', group='libvirt') @@ -653,6 +654,11 @@ 'used': ceph_df_json['pools'][1]['stats']['bytes_used']} self.assertDictEqual(expected, self.driver.get_pool_info()) + @mock.patch('oslo_concurrency.processutils.execute', autospec=True, + side_effect=processutils.ProcessExecutionError("failed")) + def test_get_pool_info_execute_failed(self, mock_execute): + self.assertRaises(exception.StorageError, self.driver.get_pool_info) + @mock.patch('oslo_concurrency.processutils.execute') def test_get_pool_info_not_found(self, mock_execute): # Make the pool something other than self.rbd_pool so it won't be found diff -Nru nova-22.0.1/nova/tests/unit/test_fixtures.py nova-22.2.2/nova/tests/unit/test_fixtures.py --- nova-22.0.1/nova/tests/unit/test_fixtures.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/test_fixtures.py 2021-07-09 05:08:56.000000000 +0200 @@ -90,6 +90,7 @@ class TestOSAPIFixture(testtools.TestCase): @mock.patch('nova.objects.Service.get_by_host_and_binary') @mock.patch('nova.objects.Service.create') + @mock.patch('nova.utils.raise_if_old_compute', new=mock.Mock()) def test_responds_to_version(self, mock_service_create, mock_get): """Ensure the OSAPI server responds to calls sensibly.""" self.useFixture(output.CaptureOutput()) diff -Nru nova-22.0.1/nova/tests/unit/test_hacking.py nova-22.2.2/nova/tests/unit/test_hacking.py --- nova-22.0.1/nova/tests/unit/test_hacking.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/test_hacking.py 2021-07-09 05:08:56.000000000 +0200 @@ -897,7 +897,7 @@ expected_errors=errors, filename="nova/tests/unit/test_context.py") # Check no errors in other than 'nova/tests' directory. self._assert_has_no_errors( - code, checks.nonexistent_assertion_methods_and_attributes, + code, checks.useless_assertion, filename="nova/compute/api.py") code = """ self.assertIsNone(None_test_var, "Fails") diff -Nru nova-22.0.1/nova/tests/unit/test_metadata.py nova-22.2.2/nova/tests/unit/test_metadata.py --- nova-22.0.1/nova/tests/unit/test_metadata.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/test_metadata.py 2021-07-09 05:08:56.000000000 +0200 @@ -322,12 +322,14 @@ 'uuid': 'e5fe5518-0288-4fa3-b0c4-c79764101b85', 'root_device_name': None, 'default_ephemeral_device': None, - 'default_swap_device': None}) + 'default_swap_device': None, + 'context': self.context}) instance_ref1 = objects.Instance(**{'id': 0, 'uuid': 'b65cee2f-8c69-4aeb-be2f-f79742548fc2', 'root_device_name': '/dev/sda1', 'default_ephemeral_device': None, - 'default_swap_device': None}) + 'default_swap_device': None, + 'context': self.context}) def fake_bdm_get(ctxt, uuid): return [fake_block_device.FakeDbBlockDeviceDict( @@ -366,10 +368,12 @@ 'swap': '/dev/sdc', 'ebs0': '/dev/sdh'} - self.assertEqual(base._format_instance_mapping(self.context, - instance_ref0), block_device._DEFAULT_MAPPINGS) - self.assertEqual(base._format_instance_mapping(self.context, - instance_ref1), expected) + self.assertEqual( + base._format_instance_mapping(instance_ref0), + block_device._DEFAULT_MAPPINGS) + self.assertEqual( + base._format_instance_mapping(instance_ref1), + expected) def test_pubkey(self): md = fake_InstanceMetadata(self, self.instance.obj_clone()) diff -Nru nova-22.0.1/nova/tests/unit/test_service.py nova-22.2.2/nova/tests/unit/test_service.py --- nova-22.0.1/nova/tests/unit/test_service.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/test_service.py 2021-07-09 05:08:56.000000000 +0200 @@ -268,6 +268,24 @@ serv.reset() mock_reset.assert_called_once_with() + @mock.patch('nova.conductor.api.API.wait_until_ready') + @mock.patch('nova.utils.raise_if_old_compute') + def test_old_compute_version_check_happens_after_wait_for_conductor( + self, mock_check_old, mock_wait): + obj_base.NovaObject.indirection_api = mock.MagicMock() + + def fake_wait(*args, **kwargs): + mock_check_old.assert_not_called() + + mock_wait.side_effect = fake_wait + + service.Service.create( + self.host, self.binary, self.topic, + 'nova.tests.unit.test_service.FakeManager') + + mock_check_old.assert_called_once_with() + mock_wait.assert_called_once_with(mock.ANY) + class TestWSGIService(test.NoDBTestCase): diff -Nru nova-22.0.1/nova/tests/unit/test_utils.py nova-22.2.2/nova/tests/unit/test_utils.py --- nova-22.0.1/nova/tests/unit/test_utils.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/test_utils.py 2021-07-09 05:08:56.000000000 +0200 @@ -37,6 +37,7 @@ from nova import exception from nova.objects import base as obj_base from nova.objects import instance as instance_obj +from nova.objects import service as service_obj from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.unit.objects import test_objects @@ -1207,3 +1208,187 @@ self.mock_get_confgrp.assert_called_once_with(self.service_type) self.mock_connection.assert_not_called() self.mock_get_auth_sess.assert_not_called() + + +class TestOldComputeCheck(test.NoDBTestCase): + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_no_compute(self, mock_get_min_service): + mock_get_min_service.return_value = 0 + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_old_but_supported_compute(self, mock_get_min_service): + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_new_compute(self, mock_get_min_service): + mock_get_min_service.return_value = service_obj.SERVICE_VERSION + + utils.raise_if_old_compute() + + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch('nova.objects.service.Service.get_minimum_version') + def test_too_old_compute_cell(self, mock_get_min_service): + self.flags(group='api_database', connection=None) + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('cell', str(ex)) + mock_get_min_service.assert_called_once_with(mock.ANY, 'nova-compute') + + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_too_old_compute_top_level(self, mock_get_min_service): + self.flags(group='api_database', connection='fake db connection') + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get_min_service.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('system', str(ex)) + mock_get_min_service.assert_called_once_with( + mock.ANY, ['nova-compute']) + + @mock.patch.object(utils.LOG, 'warning') + @mock.patch('nova.objects.service.Service.get_minimum_version') + @mock.patch('nova.objects.service.get_minimum_version_all_cells') + def test_api_db_is_configured_but_the_service_cannot_access_db( + self, mock_get_all, mock_get, mock_warn): + # This is the case when the nova-compute service is wrongly configured + # with db credentials but nova-compute is never allowed to access the + # db directly. + mock_get_all.side_effect = exception.DBNotAllowed( + binary='nova-compute') + + oldest = service_obj.SERVICE_VERSION_ALIASES[ + service_obj.OLDEST_SUPPORTED_SERVICE_VERSION] + mock_get.return_value = oldest - 1 + + ex = self.assertRaises( + exception.TooOldComputeService, utils.raise_if_old_compute) + + self.assertIn('cell', str(ex)) + mock_get_all.assert_called_once_with(mock.ANY, ['nova-compute']) + mock_get.assert_called_once_with(mock.ANY, 'nova-compute') + mock_warn.assert_called_once_with( + 'This service is configured for access to the API database but is ' + 'not allowed to directly access the database. You should run this ' + 'service without the [api_database]/connection config option. The ' + 'service version check will only query the local cell.') + + +class RunOnceTests(test.NoDBTestCase): + + fake_logger = mock.MagicMock() + + @utils.run_once("already ran once", fake_logger) + def dummy_test_func(self, fail=False): + if fail: + raise ValueError() + return True + + def setUp(self): + super(RunOnceTests, self).setUp() + self.dummy_test_func.reset() + RunOnceTests.fake_logger.reset_mock() + + def test_wrapped_funtions_called_once(self): + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. + result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_called_once_raises(self): + self.assertFalse(self.dummy_test_func.called) + self.assertRaises(ValueError, self.dummy_test_func, fail=True) + self.assertTrue(self.dummy_test_func.called) + + # assert that on second invocation no result + # is returned and that the logger is invoked. + result = self.dummy_test_func() + RunOnceTests.fake_logger.assert_called_once() + self.assertIsNone(result) + + def test_wrapped_funtions_can_be_reset(self): + # assert we start with a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + self.dummy_test_func.reset() + # assert we restored a clean state + self.assertFalse(self.dummy_test_func.called) + result = self.dummy_test_func() + self.assertTrue(result) + + # assert that we never called the logger + RunOnceTests.fake_logger.assert_not_called() + + def test_reset_calls_cleanup(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + f.reset() + self.assertFalse(f.called) + mock_clean.assert_called_once_with() + + def test_clean_is_not_called_at_reset_if_wrapped_not_called(self): + mock_clean = mock.Mock() + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + self.assertFalse(f.called) + + f.reset() + self.assertFalse(f.called) + self.assertFalse(mock_clean.called) + + def test_reset_works_even_if_cleanup_raises(self): + mock_clean = mock.Mock(side_effect=ValueError()) + + @utils.run_once("already ran once", self.fake_logger, + cleanup=mock_clean) + def f(): + pass + + f() + self.assertTrue(f.called) + + self.assertRaises(ValueError, f.reset) + self.assertFalse(f.called) + mock_clean.assert_called_once_with() diff -Nru nova-22.0.1/nova/tests/unit/virt/disk/vfs/fakeguestfs.py nova-22.2.2/nova/tests/unit/virt/disk/vfs/fakeguestfs.py --- nova-22.0.1/nova/tests/unit/virt/disk/vfs/fakeguestfs.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/virt/disk/vfs/fakeguestfs.py 2021-07-09 05:08:56.000000000 +0200 @@ -109,7 +109,7 @@ "mode": 0o700 } - return self.files[path]["content"] + return bytes(self.files[path]["content"].encode()) def write(self, path, content): if path not in self.files: diff -Nru nova-22.0.1/nova/tests/unit/virt/libvirt/test_driver.py nova-22.2.2/nova/tests/unit/virt/libvirt/test_driver.py --- nova-22.0.1/nova/tests/unit/virt/libvirt/test_driver.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/tests/unit/virt/libvirt/test_driver.py 2021-07-09 05:08:56.000000000 +0200 @@ -9136,6 +9136,9 @@ 'encryption_key_id': uuids.encryption_key_id} instance = mock.sentinel.instance + # Mock out find_secret so we don't skip ahead + drvr._host.find_secret.return_value = None + # Mock out the encryptors mock_encryptor = mock.Mock() mock_get_volume_encryptor.return_value = mock_encryptor @@ -10017,6 +10020,8 @@ 'serial': uuids.volume_id, 'driver_volume_type': 'rbd', 'data': {'name': 'pool/volume', + 'auth_enabled': 'true', + 'auth_username': 'username', 'access_mode': 'rw'} } disk_1 = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk, @@ -10058,7 +10063,8 @@ mock_get_encryption_metadata.assert_called_once_with( self.context, drvr._volume_api, uuids.volume_id, connection_info) - mock_qemu_img_info.assert_called_once_with('rbd:pool/volume') + mock_qemu_img_info.assert_called_once_with( + 'rbd:pool/volume:id=username') # Assert that the Libvirt call to resize the device within the instance # is called with the LUKSv1 payload offset taken into account. @@ -10196,6 +10202,21 @@ crt_scrt.assert_called_once_with( 'volume', uuids.serial, password=key) + @mock.patch.object(key_manager, 'API') + def test_attach_encryptor_secret_exists(self, mock_key_manager_api): + connection_info = {'data': {'volume_id': uuids.volume_id}} + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + with test.nested( + mock.patch.object(drvr, '_get_volume_encryption'), + mock.patch.object(drvr._host, 'find_secret') + ) as (mock_get_volume_encryption, mock_find_secret): + drvr._attach_encryptor(self.context, connection_info, None) + + # Assert we called find_secret and nothing else + mock_find_secret.assert_called_once_with('volume', uuids.volume_id) + mock_get_volume_encryption.assert_not_called() + mock_key_manager_api.assert_not_called() + @mock.patch('os_brick.encryptors.get_encryption_metadata') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_encryptor') def test_detach_encryptor_connection_info_incomplete(self, @@ -21177,6 +21198,54 @@ for trait in ['HW_CPU_X86_AVX512F', 'HW_CPU_X86_BMI']: self.assertIn(trait, self.pt.data(self.cn_rp['uuid']).traits) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_gpu_inventories') + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_get_cpu_feature_traits', + new=mock.Mock(return_value=cpu_traits)) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_local_gb_info', + new=mock.Mock(return_value={'total': 0})) + @mock.patch('nova.virt.libvirt.host.Host.get_memory_mb_total', + new=mock.Mock(return_value=0)) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_pcpu_available', + new=mock.Mock(return_value=range(0))) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vcpu_available', + new=mock.Mock(return_value=range(0))) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_update_provider_tree_for_pcpu', + new=mock.Mock()) + def test_update_provider_tree_zero_total(self, mock_gpu_invs): + # Verify that we omit various resources from inventory when there are + # zero total quantity of those resources. Placement does not allow + # inventory updates with total=0 as they fail API schema validation. + + # Use total=0 for vgpus. + gpu_inventory_dicts = { + 'pci_0000_06_00_0': {'total': 0, + 'max_unit': 16, + 'min_unit': 1, + 'step_size': 1, + 'reserved': 0, + 'allocation_ratio': 1.0, + }, + } + mock_gpu_invs.return_value = gpu_inventory_dicts + # Use an empty list for vpmems. + self.driver._vpmems_by_rc = {'CUSTOM_PMEM_NAMESPACE_4GB': []} + # Before we update_provider_tree, we have 2 providers from setUp(): + # self.cn_rp and self.shared_rp and they are both empty {}. + self.assertEqual(2, len(self.pt.get_provider_uuids())) + # Update the provider tree. + self.driver.update_provider_tree(self.pt, self.cn_rp['name']) + # After we update_provider_tree, we should still have 2 providers + # because VGPU has total=0 and we would skip adding a child provider + # for it. + self.assertEqual(2, len(self.pt.get_provider_uuids())) + # All providers should have an empty dict because (1) we never updated + # the self.shared_rp provider and (2) the other 2 providers have zero + # for resource totals. + for uuid in self.pt.get_provider_uuids(): + self.assertEqual({}, self.pt.data(uuid).inventory) + def test_update_provider_tree_with_vgpus(self): pci_devices = ['pci_0000_06_00_0', 'pci_0000_07_00_0'] gpu_inventory_dicts = { @@ -24919,6 +24988,20 @@ libvirt_driver.LibvirtDriver, fake.FakeVirtAPI(), False) + @mock.patch.object(nova.conf.devices, 'register_dynamic_opts') + def test_get_supported_vgpu_types_registering_dynamic_opts(self, rdo): + self.flags(enabled_vgpu_types=['nvidia-11', 'nvidia-12'], + group='devices') + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + drvr._get_supported_vgpu_types() + + # Okay below is confusing, but remember, ._get_supported_vgpu_types() + # is first called by the LibvirtDriver object creation, so when + # calling the above drvr._get_supported_vgpu_types() method, it will + # be the second time that register_dynamic_opts() will be called. + rdo.assert_has_calls([mock.call(CONF), mock.call(CONF)]) + def test_get_vgpu_type_per_pgpu(self): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) device = 'pci_0000_84_00_0' @@ -26587,8 +26670,23 @@ not running should trigger a blockRebase using qemu-img not libvirt. In this test, we rebase the image with another image as backing file. """ + dom_xml = """ + + + + + + 0e38683e-f0af-418f-a3f1-6b67ea0f919d + + + + + + + """ % self.inst['uuid'] + mock_domain, guest = self._setup_block_rebase_domain_and_guest_mocks( - self.dom_xml) + dom_xml) instance = objects.Instance(**self.inst) snapshot_id = 'snapshot-1234' @@ -26599,10 +26697,13 @@ self.delete_info_1) mock_disk_op_sema.__enter__.assert_called_once() - mock_qemu_img_info.assert_called_once_with("snap.img") - mock_execute.assert_called_once_with('qemu-img', 'rebase', - '-b', 'snap.img', '-F', - 'fake_fmt', 'disk1_file') + mock_qemu_img_info.assert_called_once_with( + "/var/lib/nova/instances/%s/snap.img" % instance.uuid) + mock_execute.assert_called_once_with( + 'qemu-img', 'rebase', + '-b', '/var/lib/nova/instances/%s/snap.img' % instance.uuid, + '-F', 'fake_fmt', + '/var/lib/nova/instances/%s/disk1_file' % instance.uuid) @mock.patch.object(compute_utils, 'disk_ops_semaphore') @mock.patch.object(host.Host, "has_min_version", @@ -27549,6 +27650,15 @@ self.assertRaises(exception.PMEMNamespaceConfigInvalid, drvr._discover_vpmems, vpmem_conf) + @mock.patch('nova.privsep.libvirt.get_pmem_namespaces') + def test_get_vpmems_on_host__exception(self, mock_get_ns): + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) + mock_get_ns.side_effect = Exception('foo') + + self.assertRaises( + exception.GetPMEMNamespacesFailed, + drvr._get_vpmems_on_host) + @mock.patch('nova.virt.hardware.get_vpmems') def test_get_ordered_vpmems(self, mock_labels): # get orgered vpmems based on flavor extra_specs diff -Nru nova-22.0.1/nova/utils.py nova-22.2.2/nova/utils.py --- nova-22.0.1/nova/utils.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/utils.py 2021-07-09 05:08:56.000000000 +0200 @@ -1053,3 +1053,99 @@ norm_name = norm_name.upper() norm_name = orc.CUSTOM_NAMESPACE + norm_name return norm_name + + +def raise_if_old_compute(): + # to avoid circular imports + from nova import context as nova_context + from nova.objects import service + + ctxt = nova_context.get_admin_context() + + if CONF.api_database.connection is not None: + scope = 'system' + try: + current_service_version = service.get_minimum_version_all_cells( + ctxt, ['nova-compute']) + except exception.DBNotAllowed: + # This most likely means we are in a nova-compute service + # which is configured with a connection to the API database. + # We should not be attempting to "get out" of our cell to + # look at the minimum versions of nova-compute services in other + # cells, so DBNotAllowed was raised. Leave a warning message + # and fall back to only querying computes in our cell. + LOG.warning( + 'This service is configured for access to the API database ' + 'but is not allowed to directly access the database. You ' + 'should run this service without the ' + '[api_database]/connection config option. The service version ' + 'check will only query the local cell.') + scope = 'cell' + current_service_version = service.Service.get_minimum_version( + ctxt, 'nova-compute') + else: + scope = 'cell' + # We in a cell so target our query to the current cell only + current_service_version = service.Service.get_minimum_version( + ctxt, 'nova-compute') + + if current_service_version == 0: + # 0 means no compute in the system, + # probably a fresh install before the computes are registered + return + + oldest_supported_service_level = service.SERVICE_VERSION_ALIASES[ + service.OLDEST_SUPPORTED_SERVICE_VERSION] + + if current_service_version < oldest_supported_service_level: + raise exception.TooOldComputeService( + oldest_supported_version=service.OLDEST_SUPPORTED_SERVICE_VERSION, + scope=scope, + min_service_level=current_service_version, + oldest_supported_service=oldest_supported_service_level) + + +def run_once(message, logger, cleanup=None): + """This is a utility function decorator to ensure a function + is run once and only once in an interpreter instance. + + Note: this is copied from the placement repo (placement/util.py) + + The decorated function object can be reset by calling its + reset function. All exceptions raised by the wrapped function, + logger and cleanup function will be propagated to the caller. + """ + def outer_wrapper(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not wrapper.called: + # Note(sean-k-mooney): the called state is always + # updated even if the wrapped function completes + # by raising an exception. If the caller catches + # the exception it is their responsibility to call + # reset if they want to re-execute the wrapped function. + try: + return func(*args, **kwargs) + finally: + wrapper.called = True + else: + logger(message) + + wrapper.called = False + + def reset(wrapper, *args, **kwargs): + # Note(sean-k-mooney): we conditionally call the + # cleanup function if one is provided only when the + # wrapped function has been called previously. We catch + # and reraise any exception that may be raised and update + # the called state in a finally block to ensure its + # always updated if reset is called. + try: + if cleanup and wrapper.called: + return cleanup(*args, **kwargs) + finally: + wrapper.called = False + + wrapper.reset = functools.partial(reset, wrapper) + return wrapper + return outer_wrapper diff -Nru nova-22.0.1/nova/virt/disk/api.py nova-22.2.2/nova/virt/disk/api.py --- nova-22.0.1/nova/virt/disk/api.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/virt/disk/api.py 2021-07-09 05:08:56.000000000 +0200 @@ -615,8 +615,8 @@ :param username: the username :param admin_passwd: the admin password - :param passwd_data: path to the passwd file - :param shadow_data: path to the shadow password file + :param passwd_data: Data from the passwd file decoded as a string + :param shadow_data: Data from the shadow file decoded as a string :returns: nothing :raises: exception.NovaException(), IOError() diff -Nru nova-22.0.1/nova/virt/disk/vfs/guestfs.py nova-22.2.2/nova/virt/disk/vfs/guestfs.py --- nova-22.0.1/nova/virt/disk/vfs/guestfs.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/virt/disk/vfs/guestfs.py 2021-07-09 05:08:56.000000000 +0200 @@ -308,7 +308,14 @@ def read_file(self, path): LOG.debug("Read file path=%s", path) path = self._canonicalize_path(path) - return self.handle.read_file(path) + data = self.handle.read_file(path) + # NOTE(lyarwood): libguestfs v1.41.1 (0ee02e0117527) switched the + # return type of read_file from string to bytes and as such we need to + # handle both here, decoding and returning a string if bytes is + # provided. https://bugzilla.redhat.com/show_bug.cgi?id=1661871 + if isinstance(data, bytes): + return data.decode() + return data def has_file(self, path): LOG.debug("Has file path=%s", path) diff -Nru nova-22.0.1/nova/virt/libvirt/driver.py nova-22.2.2/nova/virt/libvirt/driver.py --- nova-22.0.1/nova/virt/libvirt/driver.py 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/nova/virt/libvirt/driver.py 2021-07-09 05:08:56.000000000 +0200 @@ -1791,6 +1791,17 @@ to determine if an attempt to attach the encryptor should be made. """ + # NOTE(lyarwood): Skip any attempt to fetch encryption metadata or the + # actual passphrase from the key manager if a libvirt secert already + # exists locally for the volume. This suggests that the instance was + # only powered off or the underlying host rebooted. + volume_id = driver_block_device.get_volume_id(connection_info) + if self._host.find_secret('volume', volume_id): + LOG.debug("A libvirt secret for volume %s has been found on the " + "host, skipping any attempt to create another or attach " + "an os-brick encryptor.", volume_id) + return + if encryption is None: encryption = self._get_volume_encryption(context, connection_info) @@ -1822,7 +1833,6 @@ # NOTE(lyarwood): Store the passphrase as a libvirt secret locally # on the compute node. This secret is used later when generating # the volume config. - volume_id = driver_block_device.get_volume_id(connection_info) self._host.create_secret('volume', volume_id, password=passphrase) elif encryption: encryptor = self._get_volume_encryptor(connection_info, @@ -2150,7 +2160,11 @@ if 'device_path' in connection_info['data']: path = connection_info['data']['device_path'] elif connection_info['driver_volume_type'] == 'rbd': - path = 'rbd:%s' % (connection_info['data']['name']) + volume_name = connection_info['data']['name'] + path = f"rbd:{volume_name}" + if connection_info['data'].get('auth_enabled'): + username = connection_info['data']['auth_username'] + path = f"rbd:{volume_name}:id={username}" else: path = 'unknown' raise exception.DiskNotFound(location='unknown') @@ -3008,7 +3022,16 @@ # If the rebased image is going to have a backing file then # explicitly set the backing file format to avoid any security # concerns related to file format auto detection. - backing_file = rebase_base + if os.path.isabs(rebase_base): + backing_file = rebase_base + else: + # this is a probably a volume snapshot case where the + # rebase_base is relative. See bug + # https://bugs.launchpad.net/nova/+bug/1885528 + backing_file_name = os.path.basename(rebase_base) + volume_path = os.path.dirname(source_path) + backing_file = os.path.join(volume_path, backing_file_name) + b_file_fmt = images.qemu_img_info(backing_file).file_format qemu_img_extra_arg = ['-F', b_file_fmt] @@ -6871,6 +6894,11 @@ if not CONF.devices.enabled_vgpu_types: return [] + # Make sure we register all the types as the compute service could + # be calling this method before init_host() + if len(CONF.devices.enabled_vgpu_types) > 1: + nova.conf.devices.register_dynamic_opts(CONF) + for vgpu_type in CONF.devices.enabled_vgpu_types: group = getattr(CONF, 'vgpu_%s' % vgpu_type, None) if group is None or not group.device_addresses: @@ -7786,16 +7814,17 @@ resources: ty.Dict[str, ty.Set['objects.Resource']] = ( collections.defaultdict(set) ) - result = { - orc.MEMORY_MB: { + + result = {} + if memory_mb: + result[orc.MEMORY_MB] = { 'total': memory_mb, 'min_unit': 1, 'max_unit': memory_mb, 'step_size': 1, 'allocation_ratio': ratios[orc.MEMORY_MB], 'reserved': CONF.reserved_host_memory_mb, - }, - } + } # NOTE(stephenfin): We have to optionally report these since placement # forbids reporting inventory with total=0 @@ -7836,15 +7865,17 @@ # compute RP once the issues from bug #1784020 have been resolved. if provider_tree.has_sharing_provider(orc.DISK_GB): LOG.debug('Ignoring sharing provider - see bug #1784020') - result[orc.DISK_GB] = { - 'total': disk_gb, - 'min_unit': 1, - 'max_unit': disk_gb, - 'step_size': 1, - 'allocation_ratio': ratios[orc.DISK_GB], - 'reserved': (self._get_reserved_host_disk_gb_from_config() + - self._get_disk_size_reserved_for_image_cache()), - } + + if disk_gb: + result[orc.DISK_GB] = { + 'total': disk_gb, + 'min_unit': 1, + 'max_unit': disk_gb, + 'step_size': 1, + 'allocation_ratio': ratios[orc.DISK_GB], + 'reserved': (self._get_reserved_host_disk_gb_from_config() + + self._get_disk_size_reserved_for_image_cache()), + } # TODO(sbauza): Use traits to providing vGPU types. For the moment, # it will be only documentation support by explaining to use @@ -7879,6 +7910,10 @@ """Update resources and inventory for vpmems in provider tree.""" prov_data = provider_tree.data(nodename) for rc, vpmems in self._vpmems_by_rc.items(): + # Skip (and omit) inventories with total=0 because placement does + # not allow setting total=0 for inventory. + if not len(vpmems): + continue inventory[rc] = { 'total': len(vpmems), 'max_unit': len(vpmems), @@ -7991,6 +8026,10 @@ # Dict of PGPU RPs keyed by their libvirt PCI name pgpu_rps = {} for pgpu_dev_id, inventory in inventories_dict.items(): + # Skip (and omit) inventories with total=0 because placement does + # not allow setting total=0 for inventory. + if not inventory['total']: + continue # For each physical GPU, we make sure to have a child provider pgpu_rp_name = '%s_%s' % (nodename, pgpu_dev_id) if not provider_tree.exists(pgpu_rp_name): diff -Nru nova-22.0.1/playbooks/legacy/nova-grenade-multinode/post.yaml nova-22.2.2/playbooks/legacy/nova-grenade-multinode/post.yaml --- nova-22.0.1/playbooks/legacy/nova-grenade-multinode/post.yaml 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/playbooks/legacy/nova-grenade-multinode/post.yaml 1970-01-01 01:00:00.000000000 +0100 @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff -Nru nova-22.0.1/playbooks/legacy/nova-grenade-multinode/run.yaml nova-22.2.2/playbooks/legacy/nova-grenade-multinode/run.yaml --- nova-22.0.1/playbooks/legacy/nova-grenade-multinode/run.yaml 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/playbooks/legacy/nova-grenade-multinode/run.yaml 1970-01-01 01:00:00.000000000 +0100 @@ -1,65 +0,0 @@ -- hosts: primary - name: nova-grenade-multinode - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PROJECTS="openstack/grenade $PROJECTS" - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_NEUTRON=1 - # NOTE(mriedem): Run tempest smoke tests specific to compute on the - # new side of the grenade environment. The post-test hook script will - # run non-smoke migration tests in a local/block and shared/ceph - # setup. Note that grenade hard-codes "tox -esmoke" for tempest on - # the old side so the regex is not appied there. - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_REGEX="tempest\.(api\.compute|scenario)\..*smoke.*" - export DEVSTACK_GATE_GRENADE=pullup - export DEVSTACK_GATE_USE_PYTHON3=True - # By default grenade runs only smoke tests so we need to set - # RUN_SMOKE to False in order to run live migration tests using - # grenade - export DEVSTACK_LOCAL_CONFIG="RUN_SMOKE=False" - # LIVE_MIGRATE_BACK_AND_FORTH will tell Tempest to run a live - # migration of the same instance to one compute node and then back - # to the other, which is mostly only interesting for grenade since - # we have mixed level computes. - export DEVSTACK_LOCAL_CONFIG+=$'\n'"LIVE_MIGRATE_BACK_AND_FORTH=True" - export BRANCH_OVERRIDE=default - export DEVSTACK_GATE_TOPOLOGY="multinode" - if [ "$BRANCH_OVERRIDE" != "default" ] ; then - export OVERRIDE_ZUUL_BRANCH=$BRANCH_OVERRIDE - fi - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff -Nru nova-22.0.1/playbooks/legacy/nova-live-migration/post.yaml nova-22.2.2/playbooks/legacy/nova-live-migration/post.yaml --- nova-22.0.1/playbooks/legacy/nova-live-migration/post.yaml 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/playbooks/legacy/nova-live-migration/post.yaml 1970-01-01 01:00:00.000000000 +0100 @@ -1,15 +0,0 @@ -- hosts: primary - tasks: - - - name: Copy files from {{ ansible_user_dir }}/workspace/ on node - synchronize: - src: '{{ ansible_user_dir }}/workspace/' - dest: '{{ zuul.executor.log_root }}' - mode: pull - copy_links: true - verify_host: true - rsync_opts: - - --include=/logs/** - - --include=*/ - - --exclude=* - - --prune-empty-dirs diff -Nru nova-22.0.1/playbooks/legacy/nova-live-migration/run.yaml nova-22.2.2/playbooks/legacy/nova-live-migration/run.yaml --- nova-22.0.1/playbooks/legacy/nova-live-migration/run.yaml 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/playbooks/legacy/nova-live-migration/run.yaml 1970-01-01 01:00:00.000000000 +0100 @@ -1,60 +0,0 @@ -- hosts: primary - name: nova-live-migration - tasks: - - - name: Ensure legacy workspace directory - file: - path: '{{ ansible_user_dir }}/workspace' - state: directory - - - shell: - cmd: | - set -e - set -x - cat > clonemap.yaml << EOF - clonemap: - - name: openstack/devstack-gate - dest: devstack-gate - EOF - /usr/zuul-env/bin/zuul-cloner -m clonemap.yaml --cache-dir /opt/git \ - https://opendev.org \ - openstack/devstack-gate - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - name: Configure devstack - shell: - # Force config drive. - cmd: | - set -e - set -x - cat << 'EOF' >>"/tmp/dg-local.conf" - [[local|localrc]] - FORCE_CONFIG_DRIVE=True - - EOF - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' - - - shell: - cmd: | - set -e - set -x - export PYTHONUNBUFFERED=true - export DEVSTACK_GATE_CONFIGDRIVE=0 - export DEVSTACK_GATE_TEMPEST=1 - export DEVSTACK_GATE_TEMPEST_NOTESTS=1 - export DEVSTACK_GATE_TOPOLOGY="multinode" - export DEVSTACK_GATE_USE_PYTHON3=True - function post_test_hook { - /opt/stack/new/nova/gate/live_migration/hooks/run_tests.sh - $BASE/new/nova/gate/test_evacuate.sh - } - export -f post_test_hook - cp devstack-gate/devstack-vm-gate-wrap.sh ./safe-devstack-vm-gate-wrap.sh - ./safe-devstack-vm-gate-wrap.sh - executable: /bin/bash - chdir: '{{ ansible_user_dir }}/workspace' - environment: '{{ zuul | zuul_legacy_vars }}' diff -Nru nova-22.0.1/playbooks/nova-evacuate/run.yaml nova-22.2.2/playbooks/nova-evacuate/run.yaml --- nova-22.0.1/playbooks/nova-evacuate/run.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/playbooks/nova-evacuate/run.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,8 @@ +--- +- hosts: all + roles: + - orchestrate-devstack + +- hosts: controller + roles: + - run-evacuate-hook diff -Nru nova-22.0.1/playbooks/nova-live-migration/post-run.yaml nova-22.2.2/playbooks/nova-live-migration/post-run.yaml --- nova-22.0.1/playbooks/nova-live-migration/post-run.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/playbooks/nova-live-migration/post-run.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,10 @@ +--- +- hosts: tempest + become: true + roles: + - role: fetch-subunit-output + zuul_work_dir: '{{ devstack_base_dir }}/tempest' + - role: process-stackviz +- hosts: controller + roles: + - run-evacuate-hook diff -Nru nova-22.0.1/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml nova-22.2.2/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml --- nova-22.0.1/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/releasenotes/notes/bug-1821755-7bd03319e34b6b10.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,11 @@ +--- +fixes: + - | + Improved detection of anti-affinity policy violation when performing live + and cold migrations. Most of the violations caused by race conditions due + to performing concurrent live or cold migrations should now be addressed + by extra checks in the compute service. Upon detection, cold migration + operations are automatically rescheduled, while live migrations have two + checks and will be rescheduled if detected by the first one, otherwise the + live migration will fail cleanly and revert the instance state back to its + previous value. diff -Nru nova-22.0.1/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml nova-22.2.2/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml --- nova-22.0.1/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/releasenotes/notes/bug_1905701-fdc7402ffe70d104.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,13 @@ +--- +fixes: + - | + The libvirt virt driver will no longer attempt to fetch volume + encryption metadata or the associated secret key when attaching ``LUKSv1`` + encrypted volumes if a libvirt secret already exists on the host. + + This resolves `bug 1905701`_ where instances with ``LUKSv1`` encrypted + volumes could not be restarted automatically by the ``nova-compute`` + service after a host reboot when the + ``[DEFAULT]/resume_guests_state_on_host_boot`` configurable was enabled. + + .. _bug 1905701: https://launchpad.net/bugs/1905701 diff -Nru nova-22.0.1/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml nova-22.2.2/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml --- nova-22.0.1/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/releasenotes/notes/bug-1911924-6e93d8a5038d18c1.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,10 @@ +--- +fixes: + - | + The `os-resetState`_ API will now log an instance action when called. The + resulting instance action being visable via the `os-instance-actions`_ API + to users and admins, resolving `bug 1911924`_. + + .. _bug 1911924: https://launchpad.net/bugs/1911924 + .. _os-instance-actions: https://docs.openstack.org/api-ref/compute/?expanded=reset-server-state-os-resetstate-action-detail,list-actions-for-server-detail + .. _os-resetState: https://docs.openstack.org/api-ref/compute/?expanded=reset-server-state-os-resetstate-action-detail diff -Nru nova-22.0.1/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml nova-22.2.2/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml --- nova-22.0.1/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/releasenotes/notes/cros-scell-resize-not-supported-with-ports-having-resource-request-a8e1029ef5983793.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,9 @@ +--- +issues: + - | + When the tempest test coverage was added for resize and cold migrate + with neutron ports having QoS minimum bandwidth policy rules we + discovered that the cross cell resize code path cannot handle such ports. + See bug https://bugs.launchpad.net/nova/+bug/1907522 for details. A fix + was implemented that makes sure that Nova falls back to same-cell resize if + the server has such ports. diff -Nru nova-22.0.1/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml nova-22.2.2/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml --- nova-22.0.1/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/releasenotes/notes/warn-when-services-started-with-old-compute-fc80b4ff58a2aaea.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,9 @@ +--- +upgrade: + - | + Nova services only support old computes if the compute is not + older than the previous major nova release. From now on nova services will + emit a warning at startup if the deployment contains too old compute + services. From the 23.0.0 (Wallaby) release nova services will refuse to + start if the deployment contains too old compute services to prevent + compatibility issues. diff -Nru nova-22.0.1/roles/run-evacuate-hook/files/setup_evacuate_resources.sh nova-22.2.2/roles/run-evacuate-hook/files/setup_evacuate_resources.sh --- nova-22.0.1/roles/run-evacuate-hook/files/setup_evacuate_resources.sh 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/roles/run-evacuate-hook/files/setup_evacuate_resources.sh 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,34 @@ +#!/bin/bash +source /opt/stack/devstack/openrc admin +set -x +set -e + +image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}') +flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}') +network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}') + +echo "Creating ephemeral test server on subnode" +openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \ +--nic net-id=${network_id} --host $SUBNODE_HOSTNAME --wait evacuate-test + +# TODO(lyarwood) Use osc to launch the bfv volume +echo "Creating boot from volume test server on subnode" +nova --os-compute-api-version 2.74 boot --flavor ${flavor_id} --poll \ +--block-device id=${image_id},source=image,dest=volume,size=1,bootindex=0,shutdown=remove \ +--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} evacuate-bfv-test + +echo "Forcing down the subnode so we can evacuate from it" +openstack --os-compute-api-version 2.11 compute service set --down ${SUBNODE_HOSTNAME} nova-compute + +count=0 +status=$(openstack compute service list --host ${SUBNODE_HOSTNAME} --service nova-compute -f value -c State) +while [ "${status}" != "down" ] +do + sleep 1 + count=$((count+1)) + if [ ${count} -eq 30 ]; then + echo "Timed out waiting for subnode compute service to be marked as down" + exit 5 + fi + status=$(openstack compute service list --host ${SUBNODE_HOSTNAME} --service nova-compute -f value -c State) +done diff -Nru nova-22.0.1/roles/run-evacuate-hook/files/test_evacuate.sh nova-22.2.2/roles/run-evacuate-hook/files/test_evacuate.sh --- nova-22.0.1/roles/run-evacuate-hook/files/test_evacuate.sh 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/roles/run-evacuate-hook/files/test_evacuate.sh 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,55 @@ +#!/bin/bash +# Source tempest to determine the build timeout configuration. +source /opt/stack/devstack/lib/tempest +source /opt/stack/devstack/openrc admin +set -x +set -e + +# Wait for the controller compute service to be enabled. +count=0 +status=$(openstack compute service list --host ${CONTROLLER_HOSTNAME} --service nova-compute -f value -c Status) +while [ "${status}" != "enabled" ] +do + sleep 1 + count=$((count+1)) + if [ ${count} -eq 30 ]; then + echo "Timed out waiting for controller compute service to be enabled" + exit 5 + fi + status=$(openstack compute service list --host ${CONTROLLER_HOSTNAME} --service nova-compute -f value -c Status) +done + +function evacuate_and_wait_for_active() { + local server="$1" + + nova evacuate ${server} + # Wait for the instance to go into ACTIVE state from the evacuate. + count=0 + status=$(openstack server show ${server} -f value -c status) + while [ "${status}" != "ACTIVE" ] + do + sleep 1 + count=$((count+1)) + if [ ${count} -eq ${BUILD_TIMEOUT} ]; then + echo "Timed out waiting for server ${server} to go to ACTIVE status" + exit 6 + fi + status=$(openstack server show ${server} -f value -c status) + done +} + +evacuate_and_wait_for_active evacuate-test +evacuate_and_wait_for_active evacuate-bfv-test + +# Make sure the servers moved. +for server in evacuate-test evacuate-bfv-test; do + host=$(openstack server show ${server} -f value -c OS-EXT-SRV-ATTR:host) + if [[ ${host} != ${CONTROLLER_HOSTNAME} ]]; then + echo "Unexpected host ${host} for server ${server} after evacuate." + exit 7 + fi +done + +# Cleanup test servers +openstack server delete --wait evacuate-test +openstack server delete --wait evacuate-bfv-test diff -Nru nova-22.0.1/roles/run-evacuate-hook/files/test_negative_evacuate.sh nova-22.2.2/roles/run-evacuate-hook/files/test_negative_evacuate.sh --- nova-22.0.1/roles/run-evacuate-hook/files/test_negative_evacuate.sh 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/roles/run-evacuate-hook/files/test_negative_evacuate.sh 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,37 @@ +#!/bin/bash +# Source tempest to determine the build timeout configuration. +source /opt/stack/devstack/lib/tempest +source /opt/stack/devstack/openrc admin +set -x +set -e + +# Now force the evacuation to the controller; we have to force to bypass the +# scheduler since we killed libvirtd which will trigger the libvirt compute +# driver to auto-disable the nova-compute service and then the ComputeFilter +# would filter out this host and we'd get NoValidHost. Normally forcing a host +# during evacuate and bypassing the scheduler is a very bad idea, but we're +# doing a negative test here. + +function evacuate_and_wait_for_error() { + local server="$1" + + echo "Forcing evacuate of ${server} to local host" + # TODO(mriedem): Use OSC when it supports evacuate. + nova --os-compute-api-version "2.67" evacuate --force ${server} ${CONTROLLER_HOSTNAME} + # Wait for the instance to go into ERROR state from the failed evacuate. + count=0 + status=$(openstack server show ${server} -f value -c status) + while [ "${status}" != "ERROR" ] + do + sleep 1 + count=$((count+1)) + if [ ${count} -eq ${BUILD_TIMEOUT} ]; then + echo "Timed out waiting for server ${server} to go to ERROR status" + exit 4 + fi + status=$(openstack server show ${server} -f value -c status) + done +} + +evacuate_and_wait_for_error evacuate-test +evacuate_and_wait_for_error evacuate-bfv-test diff -Nru nova-22.0.1/roles/run-evacuate-hook/README.rst nova-22.2.2/roles/run-evacuate-hook/README.rst --- nova-22.0.1/roles/run-evacuate-hook/README.rst 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/roles/run-evacuate-hook/README.rst 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1 @@ +Run Nova evacuation tests against a multinode environment. diff -Nru nova-22.0.1/roles/run-evacuate-hook/tasks/main.yaml nova-22.2.2/roles/run-evacuate-hook/tasks/main.yaml --- nova-22.0.1/roles/run-evacuate-hook/tasks/main.yaml 1970-01-01 01:00:00.000000000 +0100 +++ nova-22.2.2/roles/run-evacuate-hook/tasks/main.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -0,0 +1,82 @@ +- name: Setup resources and mark the subnode as forced down + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/setup_evacuate_resources.sh" + environment: + SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}" + +- name: Fence subnode by stopping q-agt and n-cpu + delegate_to: compute1 + become: true + systemd: + name: "{{ item }}" + state: stopped + with_items: + - devstack@q-agt + - devstack@n-cpu + +- name: Register running domains on subnode + delegate_to: compute1 + become: true + virt: + command: list_vms + state: running + register: subnode_vms + +- name: Destroy running domains on subnode + delegate_to: compute1 + become: true + virt: + name: "{{ item }}" + state: destroyed + with_items: "{{ subnode_vms.list_vms }}" + +- name: Stop libvirtd on "{{ inventory_hostname }}" + become: true + systemd: + name: "{{ item }}" + state: stopped + enabled: no + with_items: + - libvirtd.service + - libvirtd.socket + - libvirtd-admin.socket + - libvirtd-ro.socket + - virtlogd.service + - virtlogd-admin.socket + - virtlogd.socket + - virtlockd.service + - virtlockd-admin.socket + - virtlockd.socket + +- name: Run negative evacuate tests + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/test_negative_evacuate.sh" + environment: + CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}" + +- name: Start libvirtd on "{{ inventory_hostname }}" + become: true + systemd: + name: "{{ item }}" + state: started + enabled: yes + with_items: + - libvirtd.service + - libvirtd.socket + - libvirtd-admin.socket + - libvirtd-ro.socket + - virtlogd.service + - virtlogd-admin.socket + - virtlogd.socket + - virtlockd.service + - virtlockd-admin.socket + - virtlockd.socket + +- name: Run evacuate tests + become: true + become_user: stack + shell: "/opt/stack/nova/roles/run-evacuate-hook/files/test_evacuate.sh" + environment: + CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}" diff -Nru nova-22.0.1/tools/check-cherry-picks.sh nova-22.2.2/tools/check-cherry-picks.sh --- nova-22.0.1/tools/check-cherry-picks.sh 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/tools/check-cherry-picks.sh 2021-07-09 05:08:56.000000000 +0200 @@ -4,6 +4,11 @@ # to verify that they're all on either master or stable/ branches # +# Allow this script to be disabled by a simple env var +if [ ${DISABLE_CHERRY_PICK_CHECK:-0} -eq 1 ]; then + exit 0 +fi + commit_hash="" # Check if the patch is a merge patch by counting the number of parents. diff -Nru nova-22.0.1/tox.ini nova-22.2.2/tox.ini --- nova-22.0.1/tox.ini 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/tox.ini 2021-07-09 05:08:56.000000000 +0200 @@ -49,6 +49,8 @@ description = Run style checks. envdir = {toxworkdir}/shared +passenv = + DISABLE_CHERRY_PICK_CHECK commands = {[testenv:mypy]commands} bash tools/flake8wrap.sh {posargs} diff -Nru nova-22.0.1/.zuul.yaml nova-22.2.2/.zuul.yaml --- nova-22.0.1/.zuul.yaml 2020-11-14 11:05:51.000000000 +0100 +++ nova-22.2.2/.zuul.yaml 2021-07-09 05:08:56.000000000 +0200 @@ -56,21 +56,8 @@ bindep_profile: test py38 timeout: 3600 -# TODO(lyarwood): Remove once the new zuulv3 nova-multinode jobs are voting - job: name: nova-live-migration - parent: nova-dsvm-multinode-base - description: | - Run tempest live migration tests against both local storage and shared - storage using ceph (the environment is reconfigured for ceph after the - local storage tests are run). Also runs simple evacuate tests. - Config drive is forced on all instances. Runs with python 3. - run: playbooks/legacy/nova-live-migration/run.yaml - post-run: playbooks/legacy/nova-live-migration/post.yaml - irrelevant-files: *dsvm-irrelevant-files - -- job: - name: nova-multinode-live-migration parent: tempest-multinode-full-py3 description: | Run tempest live migration tests against local qcow2 ephemeral storage @@ -86,6 +73,7 @@ volume_backed_live_migration: true block_migration_for_live_migration: true block_migrate_cinder_iscsi: true + post-run: playbooks/nova-live-migration/post-run.yaml # TODO(lyarwood): The following jobs need to be written as part of the # migration to zuulv3 before nova-live-migration can be removed: @@ -95,13 +83,6 @@ # description: | # Run tempest live migration tests against ceph ephemeral storage and # cinder volumes. -# -#- job: -# name: nova-multinode-evacuate -# description: | -# Verifiy the evacuation of instances with local qcow2 ephemeral disks -# from down compute hosts. -# #- job: # name: nova-multinode-evacuate-ceph # description: | @@ -272,22 +253,24 @@ - job: name: nova-grenade-multinode - parent: nova-dsvm-multinode-base + parent: grenade-multinode description: | - Multi-node grenade job which runs gate/live_migration/hooks tests under - python 3. - In other words, this tests live and cold migration and resize with - mixed-version compute services which is important for things like - rolling upgrade support. + Run a multinode grenade job and run the smoke, cold and live migration + tests with the controller upgraded and the compute on the older release. The former names for this job were "nova-grenade-live-migration" and "legacy-grenade-dsvm-neutron-multinode-live-migration". - run: playbooks/legacy/nova-grenade-multinode/run.yaml - post-run: playbooks/legacy/nova-grenade-multinode/post.yaml - required-projects: - - openstack/grenade - - openstack/devstack-gate - - openstack/nova irrelevant-files: *dsvm-irrelevant-files + vars: + devstack_local_conf: + test-config: + $TEMPEST_CONFIG: + compute-feature-enabled: + live_migration: true + volume_backed_live_migration: true + block_migration_for_live_migration: true + block_migrate_cinder_iscsi: true + tox_envlist: all + tempest_test_regex: ((tempest\.(api\.compute|scenario)\..*smoke.*)|(^tempest\.api\.compute\.admin\.(test_live_migration|test_migration))) - job: name: nova-multi-cell @@ -437,10 +420,7 @@ # so that we only run it on changes to networking and libvirt/vif # code; we don't need to run this on all changes. - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - - nova-grenade-multinode - nova-live-migration - - nova-multinode-live-migration: - voting: false - nova-lvm - nova-multi-cell - nova-next @@ -464,7 +444,7 @@ - ^setup.cfg$ - ^tools/.*$ - ^tox.ini$ - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files @@ -478,7 +458,6 @@ voting: false gate: jobs: - - nova-grenade-multinode - nova-live-migration - nova-tox-functional-py38 - nova-multi-cell @@ -493,7 +472,7 @@ - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$ - tempest-integrated-compute: irrelevant-files: *policies-irrelevant-files - - grenade: + - nova-grenade-multinode: irrelevant-files: *policies-irrelevant-files - tempest-ipv6-only: irrelevant-files: *dsvm-irrelevant-files