Doubts regarding pose estimation on 2D video using PoseNet?

Hello, I have been playing with the pre-trained PoseNet model. I have a general question: I tried to do pose estimation on a random video, and the skeleton is drawn completely off the human. I was wondering whether PoseNet is simply not good at detecting poses from an arbitrary video sequence, and is only good for pose estimation with webcams, where the person (or persons) is facing the camera. In my video sequence the camera keeps changing (we have different angles). It could also be that I am doing something wrong in the code. Help and explanation would be appreciated.


// --- Sketch globals ---
// Unified on `let` (the original mixed `var` and `let`).
let video;
// Alternative test clip from the original example:
// let videofile = 'data/big_buck_bunny.ogv';
let videofile = 'test.mp4';
let poseNet;
let poses = [];      // latest estimates delivered by poseNet's 'pose' event
let skeletons = [];  // kept from the original example; unused in this sketch

// PoseNet tuning values (must be passed to ml5.poseNet as an options object)
let imageScaleFactor = 0.3;  // downscale input before inference (0..1); smaller = faster, less accurate
let outputStride = 16;       // 8, 16, or 32; smaller = more accurate, slower
let minConfidence = 0.5;     // minimum overall pose confidence
let maxPoseDetections = 1;   // only look for a single person
let scoreThreshold = 0.5;    // per-keypoint score threshold
let multiplier = 0.75;       // MobileNet width multiplier; smaller = faster, less accurate

function setup() {
  createCanvas(400, 400);
  video = createVideo(videofile, onLoad);

  // ml5.poseNet takes (video, optionsObject, callback). The original passed
  // the tuning values positionally, so ml5 silently ignored them — one likely
  // reason the detections looked off. NOTE(review): verify against the ml5
  // version in use; older releases used (video, type, callback).
  poseNet = ml5.poseNet(video, {
    imageScaleFactor: imageScaleFactor,
    outputStride: outputStride,
    minConfidence: minConfidence,
    maxPoseDetections: maxPoseDetections,
    scoreThreshold: scoreThreshold,
    multiplier: multiplier,
  }, modelReady);

  // Cache the latest estimates; draw() reads `poses` every frame.
  poseNet.on('pose', function (results) {
    poses = results;
  });
}

// Called once the video file has finished loading; playback itself is
// started by mousePressed() (browsers block un-gestured autoplay).
function onLoad() {
  print("mouse click to start");
}

// Callback fired when the PoseNet model weights have loaded.
function modelReady() {
  console.log('model ready');
}

// PoseNet 'pose' handler (unused; setup() registers an inline listener
// instead — see the commented-out poseNet.on('pose', gotPoses)).
// The original parameter was named `poses`, shadowing the global so the
// results could never reach draw(); renamed to fix that.
function gotPoses(results) {
  poses = results;
}

function draw() {
  // Paint the current video frame, then overlay the latest estimates.
  image(video, 0, 0, width, height);

  // We can call both functions to draw all keypoints and the skeletons
  drawKeypoints();
  drawSkeleton();
}

// A function to draw ellipses over the detected keypoints
function drawKeypoints() {
  // Loop through all the poses detected
  for (let i = 0; i < poses.length; i++) {
    // For each pose detected, loop through all the keypoints
    for (let j = 0; j < poses[i].pose.keypoints.length; j++) {
      // A keypoint is an object describing a body part (like rightArm or leftShoulder)
      const keypoint = poses[i].pose.keypoints[j];
      // Only draw an ellipse if the keypoint score is above 0.1
      // (the original comment said 0.2 but the code tests 0.1)
      if (keypoint.score > 0.1) {
        fill(255, 0, 0);
        ellipse(keypoint.position.x, keypoint.position.y, 10, 10);
      }
    }
  }
}
// A function to draw the skeletons
function drawSkeleton() {
  // Loop through all the skeletons detected
  for (let i = 0; i < poses.length; i++) {
    // For every skeleton, loop through all body connections
    // (each connection is a [partA, partB] pair of keypoints)
    for (let j = 0; j < poses[i].skeleton.length; j++) {
      const partA = poses[i].skeleton[j][0];
      const partB = poses[i].skeleton[j][1];
      stroke(255, 0, 0);
      line(partA.position.x, partA.position.y, partB.position.x, partB.position.y);
    }
  }
}
//======End skeleton ============/
// First user gesture: start looping video playback (browsers require a
// gesture before media can play).
function mousePressed() {
  video.loop(); // set the video to loop mode (and start playback)
  print("set loop mode");
}

I don’t think cuts in the video content should matter – I think (don’t know for certain) that each video frame gets an independent estimate. That said, I do think it is optimized for centered subjects facing the camera – and/or those would give best results as they contain more information.

I’m not sure, but I think that getting some erroneous arm data on certain frames is pretty normal – I can see that happening in this writeup, for example, with the center person’s arms often not reading correctly:

Thank you! I was also under the impression that pose estimation runs on each frame independently. From the examples one sees, as you mentioned, the subject must be facing the camera. I am not sure who to ask, so I raised this as an issue in the GitHub repository — not sure if that's the right place.