Drawing a skeleton for the movenet p5js sketch

Hi, I have been trying to run MoveNet with p5.js. I have managed to get the detector working and can log out the poses. However, I am trying to draw a skeleton on top of the detected keypoint circles for visualization purposes. I looked into PoseNet, since it has a similar set of 17 keypoints, but I noticed that some functions such as posenet.getAdjacentKeypoints are not implemented for MoveNet. My sample trial sketch (inspired by Coding Train) is below.

The keypoints are following the COCO keypoint format. coco keypoint format

let detector;
let poses;
let video;
// Create the MoveNet detector (single-pose Lightning variant) once and
// store it in the shared `detector` variable for getPoses() to use.
async function init() {
  detector = await poseDetection.createDetector(
    poseDetection.SupportedModels.MoveNet,
    { modelType: poseDetection.movenet.modelType.SINGLEPOSE_LIGHTNING }
  );
}

// p5 createCapture() callback: once the webcam stream is live,
// kick off the continuous pose-estimation loop.
async function videoReady() {
  console.log("video ready");
  return getPoses();
}

// p5 entry point: create the canvas, open (and hide) the webcam
// capture, then download/initialize the MoveNet model.
async function setup() {
  createCanvas(640, 480);
  video = createCapture(VIDEO, videoReady);
  video.hide();
  await init();
  //createButton('pose').mousePressed(getPoses)
}

// Run one round of pose estimation on the current video frame, publish
// the result to the shared `poses` variable, and immediately queue the
// next round (setTimeout(..., 0) yields to the browser between frames).
async function getPoses() {
  const detected = await detector.estimatePoses(video.elt);
  poses = detected;
  setTimeout(getPoses, 0);
}

// Render the current video frame, then overlay the most recent pose:
// lines between adjacent joints plus circles for confident keypoints.
// Fixes the original second loop, which was broken dead code
// (`poses[0].keypoints.length.nose` is undefined) and never drew lines.
// Also drops the per-frame console.log of every keypoint (log spam).
function draw() {
  background(220);
  image(video, 0, 0);
  if (!poses || poses.length === 0) return;

  const keypoints = poses[0].keypoints;

  // COCO-format adjacency list for the 17 MoveNet keypoints.
  const edges = [
    [0, 1], [0, 2], [1, 3], [2, 4],          // nose-eyes-ears
    [5, 6], [5, 7], [7, 9], [6, 8], [8, 10], // shoulders and arms
    [5, 11], [6, 12], [11, 12],              // torso
    [11, 13], [13, 15], [12, 14], [14, 16],  // legs
  ];

  // Skeleton lines: draw an edge only when both endpoints are confident.
  stroke(0);
  strokeWeight(2);
  for (const [a, b] of edges) {
    const p1 = keypoints[a];
    const p2 = keypoints[b];
    if (p1.score > 0.5 && p2.score > 0.5) {
      line(p1.x, p1.y, p2.x, p2.y);
    }
  }

  // Keypoint markers.
  for (const { x, y, score } of keypoints) {
    if (score > 0.5) {
      fill(255);
      stroke(0);
      strokeWeight(4);
      circle(x, y, 16);
    }
  }
}

If I understand correctly, the line() function needs both the (x1, y1) and (x2, y2) coordinates to effectively join two points. I was wondering if anyone has managed to overlay the skeleton.

When asking questions like this it is a good idea to mention specifically what libraries you are using, and if possible include a link to a complete, runnable sketch on OpenProcessing.org, editor.p5js.org, etc… Otherwise anybody who would like to help you has to do a bunch of unnecessary research just to figure out what you are asking.

1 Like

There seems to be very limited documentation for MoveNet, but I was able to deduce that there doesn’t seem to be a built in data structure defining edges between “keypoints”. However since each keypoint is always at the same position in the array it is trivial to define your own graph as a list of edges. I made a quick example that goes one step further by defining synthetic vertices which are the average of existing keypoints (since there are no keypoints for things like the center of the chest or center of the waist).

// Minimum MoveNet confidence score for a keypoint to be drawn or used.
const ScoreThreshold = 0.4;

let detector;
let poses;
let video;

// Download and initialize the MoveNet single-pose Lightning detector,
// storing it in the shared `detector` variable.
async function init() {
	console.log("initializing");
	detector = await poseDetection.createDetector(
		poseDetection.SupportedModels.MoveNet,
		{ modelType: poseDetection.movenet.modelType.SINGLEPOSE_LIGHTNING }
	);
}

// createCapture callback — begins the pose-estimation loop once the
// webcam stream is actually producing frames.
async function videoReady() {
	console.log("video ready");
	return getPoses();
}

// p5 entry point: canvas and text styling for the keypoint index
// labels, then the (blocking) model download, then the hidden webcam.
async function setup() {
	createCanvas(640, 480);
	textSize(16);
	textAlign(CENTER, CENTER);

	// Load the model before opening the camera so getPoses() always
	// has a detector available when videoReady fires.
	await init();

	video = createCapture(VIDEO, videoReady);
	video.hide();

	//createButton('pose').mousePressed(getPoses)
	console.log("setup complete");
}

// Estimate poses for the current frame, publish them to the shared
// `poses` variable, and reschedule immediately (yielding to the browser).
async function getPoses() {
	const latest = await detector.estimatePoses(video.elt);
	poses = latest;
	setTimeout(getPoses, 0);
}

let first = true;

// Debug helper: click the canvas to dump the latest pose data.
function mouseClicked() {
	console.log(poses);
}

// A list of pairs of either keypoint indices or sub lists of keypoint indicies
// Each pair defines an edge in the skeleton "graph"
// When a pair contains a sublist, that is meant to represent the average of two keypoints
//
// COCO keypoint indices: 0 nose, 1 left_eye, 2 right_eye, 3 left_ear,
// 4 right_ear, 5 left_shoulder, 6 right_shoulder, 7 left_elbow,
// 8 right_elbow, 9 left_wrist, 10 right_wrist, 11 left_hip, 12 right_hip,
// 13 left_knee, 14 right_knee, 15 left_ankle, 16 right_ankle.
const skeleton = [
	[0, 1], // nose - left eye
	[0, 2], // nose - right eye
	[1, 3], // left eye - left ear
	[2, 4], // right eye - right ear
	[0, [6, 5]], // nose - shoulder midpoint (synthetic "neck")
	[6, 5], // right shoulder - left shoulder
	[5, 7], // left shoulder - left elbow
	[6, 8], // right shoulder - right elbow
	[7, 9], // left elbow - left wrist
	[8, 10], // right elbow - right wrist
	// shoulder midpoint - hip midpoint (synthetic "spine")
	[
		[5, 6],
		[11, 12]
	],
	// hip midpoint - left hip
	[
		[11, 12], 11
	],
	// hip midpoint - right hip
	[
		[11, 12], 12
	],
	[11, 13], // left hip - left knee
	[12, 14], // right hip - right knee
	[13, 15], // left knee - left ankle
	[14, 16], // right knee - right ankle
];

/**
 * Resolve one endpoint of a skeleton edge to canvas coordinates.
 *
 * @param {Array<{x: number, y: number, score: number}>} keypoints -
 *   MoveNet keypoints in COCO order.
 * @param {number|number[]} vertex - either a single keypoint index, or a
 *   list of indices whose average position defines a synthetic vertex
 *   (e.g. the midpoint between the shoulders).
 * @param {number} [threshold=ScoreThreshold] - minimum confidence score;
 *   every contributing keypoint must exceed it. Added as a trailing
 *   optional parameter, so existing two-argument callers are unaffected.
 * @returns {{x: number, y: number}|undefined} the point, or undefined when
 *   any contributing keypoint is below the threshold (callers skip the edge).
 */
function getKeypointForEdgeVertex(keypoints, vertex, threshold = ScoreThreshold) {
	if (typeof vertex === "number") {
		const { x, y, score } = keypoints[vertex];
		if (score > threshold) {
			return { x, y };
		}
	} else if (Array.isArray(vertex)) {
		const points = vertex.map((v) => keypoints[v]);
		if (points.every((kp) => kp.score > threshold)) {
			// Plain arithmetic mean — simpler than the original incremental
			// weighted-average reduce, with the same result.
			const x = points.reduce((sum, kp) => sum + kp.x, 0) / points.length;
			const y = points.reduce((sum, kp) => sum + kp.y, 0) / points.length;
			return { x, y };
		}
	}
	return undefined;
}

// Main render loop: video frame first, then the skeleton edges,
// then the keypoint markers labeled with their COCO indices.
function draw() {
	if (first) {
		console.log("drawing");
		first = false;
	}
	background(220);
	if (!video) return;

	image(video, 0, 0);
	if (!(poses && poses.length > 0)) return;

	const kps = poses[0].keypoints;

	// Edges: drawn only when both endpoints pass the score threshold
	// (getKeypointForEdgeVertex returns undefined otherwise).
	stroke('green');
	strokeWeight(2);
	for (const [from, to] of skeleton) {
		const start = getKeypointForEdgeVertex(kps, from);
		const end = getKeypointForEdgeVertex(kps, to);
		if (start && end) {
			line(start.x, start.y, end.x, end.y);
		}
	}

	// Keypoint markers, each labeled with its index for debugging.
	kps.forEach(({ x, y, score }, i) => {
		if (score > ScoreThreshold) {
			fill(255);
			stroke(0);
			strokeWeight(4);
			circle(x, y, 16);

			push();
			fill('red');
			noStroke();
			text(`${i}`, x, y);
			pop();
		}
	});
}

Depends on:

<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.js"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-core"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-converter"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-backend-webgl"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/pose-detection"></script>
1 Like

Thanks a lot, @KumuPaul! I actually tried to access the keypoints manually and connect them with line(), but this solution looks much more elegant and also seems to have no jitter.

Can i ask , if its possible to have different colors with different joints ? Anyways this solution is very elegant

Something like this was my idea

// Bug fix: stroke() must be called *before* line() — p5's stroke color
// applies to subsequent draw calls, so the original ordering colored
// every segment with the previous segment's color (off by one).
//0-2
stroke(255, 0, 0);
line(poses[0].keypoints[0].x, poses[0].keypoints[0].y, poses[0].keypoints[2].x, poses[0].keypoints[2].y);
//2-4
stroke(255, 85, 0);
line(poses[0].keypoints[2].x, poses[0].keypoints[2].y, poses[0].keypoints[4].x, poses[0].keypoints[4].y);
//0-1
stroke(255, 255, 0);
line(poses[0].keypoints[0].x, poses[0].keypoints[0].y, poses[0].keypoints[1].x, poses[0].keypoints[1].y);

To add colors you just need to add that information to the skeleton data structure. The best way to do this would be to replace my arrays with objects:

// Each entry pairs an edge (two keypoint indices) with an RGB color.
const skeleton = [
  { edge: [0, 1], color: [255, 0, 0] }, 
  // ...
];

Alternately you could have separate array with just color information:

// Parallel array: skeletonColors[i] is the RGB color for skeleton[i].
const skeletonColors = [
  [255, 0, 0],
  // ...
];

Then the drawing code would look like:

// Iterate by index so each edge can look up its color in the parallel
// skeletonColors array.
for (let i = 0; i < skeleton.length; i++) {
  let edge = skeleton[i];
  let start = getKeypointForEdgeVertex(poses[0].keypoints, edge[0]);
  let end = getKeypointForEdgeVertex(poses[0].keypoints, edge[1]);

  if (start && end) {
    // Set the color before drawing; spread the [r, g, b] triple into stroke().
    stroke(...skeletonColors[i]);
    line(start.x, start.y, end.x, end.y);
  }
}
1 Like

@KumuPaul — great, thanks! I think my manual way of doing it was not correct; the visualizations looked quite bad. The solution you provided is much more structured. Another thing I was experimenting with was running the model in full-screen mode; however, the skeleton did not scale to match. What I basically did was change

createCanvas(windowWidth, windowHeight); and the drawing function to image(video, 0, 0,windowWidth,windowHeight);`

The poses are definitely detected but they are not overlaid on the body but it seems to be displaced a bit. It seems it works with dimension 640,480

The coordinates for the keypoints are going to be based on the dimensions of the video. So if you scale the video image to a different size you’re also going to need to map the skeleton coordinates to that size. I’ve updated the example above to support scaling the video and skeleton. Here are some relevant code snippets:

		// calculate the size of the video such that it fits in the window.
		let vw, vh;
		// This isn't valid during setup() for some reason
	  videoAspect = video.width / video.height;
		// NOTE(review): videoAspect is assigned without let/const, so it is an
		// implicit global — presumably declared elsewhere in the full sketch;
		// worth confirming. Same for screenAspect below.
		if (screenAspect >= videoAspect) {
			// The screen is wider than the video
			vh = height;
			vw = height * videoAspect;
		} else {
			// The video is wider than the screen
			vw = width;
			vh = width / videoAspect;
		}
// ...

		// One way to adjust the skeleton to match the video size would be to use
		// scale():
		//     scale(vw / video.width, vh / video.height);
		// However this scales stroke and text size as well. And if we wanted to flip
		// the skeleton horizontally (using a negative scaling factor in the x axis),
		// this would also flip the text.
		//
		// To only adjust position we can use the map() function like so:
		//     map(x, 0, video.width, vw, 0)
		// Note that the ordering of the input range and output range is flipped in
		// order to mirror the skeleton.
		
		// helper functions
		const mapX = (x) => map(x, 0, video.width, vw, 0);
		const mapY = (y) => map(y, 0, video.height, 0, vh);
1 Like

@KumuPaul thank you for the detailed explanation. I am quite new to JS and have a more python background. I will have to go through the concepts of map functions . Thanks for your detailed explanations , it is quite helpful.

I was looking further into the documentation and was curious about normalized keypoints — do you have any idea about that? (Basically, I know normalization is done when the person is perhaps too far from or too close to the camera.) Normalization.

However it seems like the method poseDetection.calculator.keypointsToNormalizedKeypoints(keypoints, imageSize) ,does not seem to exist . Any idea on that ?

The map() function looks complicated but it is pretty straightforward. It takes a input value, the range in which that value might fall, and an output range, and it returns an output value that falls within the specified output range according to where in the input range the input value was:

// Signature of p5's map(): re-maps inputValue from the input range to
// the corresponding position in the output range.
map(
  inputValue,
  inputRangeMinimum,
  inputRangeMaximum,
  outputRangeMinimum,
  outputRangeMaximum
);

So if you gave an input of 5, and an input range of 0, 10, and an output range of 0, 1, then the result would be 0.5 because just as 5 is halfway between 0 and 10, so 0.5 is halfway between 0 and 1.

The exact math looks something like (input - inputRangeMinimum) / (inputRangeMaximum - inputRangeMinimum) * (outputRangeMaximum - outputRangeMinimum) + outputRangeMinimum.

Normalization is going to take positions ranging from 0 to video.width in the x dimension and 0 to video.height in the y dimension and convert them to coordinates from 0 to 1 in both dimensions. So that would be trivial to implement with the map() function

1 Like

Thank you for the explanation. So for the normalization aspect, I can just take the keypoint output and divide the x coordinate by video.width and the y coordinate by video.height.

I actually came across this comment in tensorflow forum group .
p5js drawing twice
with

Also it seems P5 does some rendering magic of its own - it draws the video frame to canvas and then the dots on top of that, which is not terribly efficient as you are sampling video frame twice - you can just absolute position canvas on top of video element and draw the circles only on top of the already playing video saving you pushing twice the number of video pixel data around and only needing to worry about rendering dots to canvas based on the rendered size of the canvas.

I am a bit confused by what is meant by rendering it twice ?

In order to process and display video content from a webcam p5.js creates a <video /> HTML element and then loads image data for the current frame when rendering it to the canvas. I believe what that quotation is suggesting is that it would be more performant to allow the browser to just display the original HTML element and then display the canvas element over top of it. This is probably true since the browser can probably use hardware acceleration and/or faster algorithms when displaying the <video /> element directly. However I haven’t seen problems when hiding the video element and displaying the video frames with the image function.

1 Like

Thank you for the reply. I was trying the sketch on a mobile device and I had changed the capture module for forward looking camera. I observed that the video gets mirrored on X axis. Is it because of the translate function


translate(-vw, 0);

I changed the video capture method to the following

// getUserMedia-style constraints: request the rear ("environment") camera.
// Bug fix: the original snippet was missing the closing brace of the
// `video` object, which is a SyntaxError.
// NOTE(review): `exact` makes getUserMedia fail outright on devices with
// no rear camera; `ideal: "environment"` is a softer preference.
var constraints = {
  audio: false,
  video: {
    facingMode: {
      exact: "environment"
    }
  }
};

and did the video stream capture as follows : video = createCapture(constraints, videoReady);

What could be the best way to overcome the mirroring effect . I tried to remove the translate function all together . I think it rendered out of the canvas .

In order to undo the mirroring from my example you will need to get rid of the scale(1, -1) as well as the translate(-vw, 0). The scale flips everything, and then the translate shifts it back on screen (because the flip happens “around” the left hand side of the screen).

You will also need to un-mirror the points and lines:

    // NOTE: these are two alternatives — keep only one, since declaring
    // `const mapX` twice in the same scope would be a SyntaxError.

    // This is mirrored because the output range is in the reverse direction of the input
    const mapX = (x) => map(x, 0, video.width, vw, 0);

    // This is not mirrored:
    const mapX = (x) => map(x, 0, video.width, 0, vw);
1 Like

(post deleted by author)